From 3783b9bbde4a6fd0e22459c0119552cce5f5a825 Mon Sep 17 00:00:00 2001
From: "Node.js GitHub Bot" <github-bot@iojs.org>
Date: Tue, 17 Dec 2024 08:07:33 -0500
Subject: [PATCH] deps: update simdutf to 5.6.4

PR-URL: https://github.com/nodejs/node/pull/56255
Reviewed-By: Rafael Gonzaga <rafael.nunu@hotmail.com>
Reviewed-By: Luigi Pinca <luigipinca@gmail.com>
---
 deps/simdutf/simdutf.cpp | 63241 +++++++++++++++++++++++--------------
 deps/simdutf/simdutf.h   |    37 +-
 2 files changed, 39212 insertions(+), 24066 deletions(-)

diff --git a/deps/simdutf/simdutf.cpp b/deps/simdutf/simdutf.cpp
index 007fa02b165204..eb3e4598407374 100644
--- a/deps/simdutf/simdutf.cpp
+++ b/deps/simdutf/simdutf.cpp
@@ -1,4 +1,4 @@
-/* auto-generated on 2024-11-21 10:33:28 -0500. Do not edit! */
+/* auto-generated on 2024-12-10 14:54:53 -0500. Do not edit! */
 /* begin file src/simdutf.cpp */
 #include "simdutf.h"
 // We include base64_tables once.
@@ -6410,43 +6410,42 @@ SIMDUTF_UNTARGET_REGION
 
 #endif // SIMDUTF_RVV_H
 /* end file src/simdutf/rvv.h */
-/* begin file src/simdutf/fallback.h */
-#ifndef SIMDUTF_FALLBACK_H
-#define SIMDUTF_FALLBACK_H
+/* begin file src/simdutf/lsx.h */
+#ifndef SIMDUTF_LSX_H
+#define SIMDUTF_LSX_H
 
+#ifdef SIMDUTF_FALLBACK_H
+  #error "lsx.h must be included before fallback.h"
+#endif
 
-// Note that fallback.h is always imported last.
 
-// Default Fallback to on unless a builtin implementation has already been
-// selected.
-#ifndef SIMDUTF_IMPLEMENTATION_FALLBACK
-  #if SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE ||        \
-      SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE ||     \
-      SIMDUTF_CAN_ALWAYS_RUN_PPC64 || SIMDUTF_CAN_ALWAYS_RUN_RVV
-    #define SIMDUTF_IMPLEMENTATION_FALLBACK 0
-  #else
-    #define SIMDUTF_IMPLEMENTATION_FALLBACK 1
-  #endif
+#ifndef SIMDUTF_IMPLEMENTATION_LSX
+  #define SIMDUTF_IMPLEMENTATION_LSX (SIMDUTF_IS_LSX)
+#endif
+#if SIMDUTF_IMPLEMENTATION_LSX && SIMDUTF_IS_LSX
+  #define SIMDUTF_CAN_ALWAYS_RUN_LSX 1
+#else
+  #define SIMDUTF_CAN_ALWAYS_RUN_LSX 0
 #endif
 
 #define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)
 
-#if SIMDUTF_IMPLEMENTATION_FALLBACK
+#if SIMDUTF_IMPLEMENTATION_LSX
 
 namespace simdutf {
 /**
- * Fallback implementation (runs on any machine).
+ * Implementation for LoongArch SX.
  */
-namespace fallback {} // namespace fallback
+namespace lsx {} // namespace lsx
 } // namespace simdutf
 
-/* begin file src/simdutf/fallback/implementation.h */
-#ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H
-#define SIMDUTF_FALLBACK_IMPLEMENTATION_H
+/* begin file src/simdutf/lsx/implementation.h */
+#ifndef SIMDUTF_LSX_IMPLEMENTATION_H
+#define SIMDUTF_LSX_IMPLEMENTATION_H
 
 
 namespace simdutf {
-namespace fallback {
+namespace lsx {
 
 namespace {
 using namespace simdutf;
@@ -6455,8 +6454,8 @@ using namespace simdutf;
 class implementation final : public simdutf::implementation {
 public:
   simdutf_really_inline implementation()
-      : simdutf::implementation("fallback", "Generic fallback implementation",
-                                0) {}
+      : simdutf::implementation("lsx", "LOONGARCH SX",
+                                internal::instruction_set::LSX) {}
   simdutf_warn_unused int detect_encodings(const char *input,
                                            size_t length) const noexcept final;
   simdutf_warn_unused bool validate_utf8(const char *buf,
@@ -6541,12 +6540,6 @@ class implementation final : public simdutf::implementation {
       const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
   simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
       const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_utf32_to_utf8(
-      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
-      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
-      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
   simdutf_warn_unused size_t
   convert_utf32_to_latin1(const char32_t *buf, size_t len,
                           char *latin1_output) const noexcept final;
@@ -6556,6 +6549,12 @@ class implementation final : public simdutf::implementation {
   simdutf_warn_unused size_t
   convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
                                 char *latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf8(
+      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
   simdutf_warn_unused size_t
   convert_utf32_to_utf16le(const char32_t *buf, size_t len,
                            char16_t *utf16_buffer) const noexcept final;
@@ -6630,3980 +6629,6754 @@ class implementation final : public simdutf::implementation {
   utf8_length_from_latin1(const char *input, size_t length) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(
       const char *input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(
-      const char *input, size_t length, char *output, base64_options options,
-      last_chunk_handling_options last_chunk_options) const noexcept;
-  simdutf_warn_unused full_result base64_to_binary_details(
-      const char *input, size_t length, char *output, base64_options options,
-      last_chunk_handling_options last_chunk_options =
-          last_chunk_handling_options::loose) const noexcept;
+  simdutf_warn_unused result
+  base64_to_binary(const char *input, size_t length, char *output,
+                   base64_options options) const noexcept;
   simdutf_warn_unused size_t maximal_binary_length_from_base64(
       const char16_t *input, size_t length) const noexcept;
-  simdutf_warn_unused result base64_to_binary(
-      const char16_t *input, size_t length, char *output,
-      base64_options options,
-      last_chunk_handling_options last_chunk_options) const noexcept;
+  simdutf_warn_unused result
+  base64_to_binary(const char16_t *input, size_t length, char *output,
+                   base64_options options) const noexcept;
   simdutf_warn_unused size_t base64_length_from_binary(
       size_t length, base64_options options) const noexcept;
-  simdutf_warn_unused full_result base64_to_binary_details(
+  size_t binary_to_base64(const char *input, size_t length, char *output,
+                          base64_options options) const noexcept;
+
+  simdutf_warn_unused virtual result
+  base64_to_binary(const char *input, size_t length, char *output,
+                   base64_options options,
+                   last_chunk_handling_options last_chunk_options =
+                       last_chunk_handling_options::loose) const noexcept;
+  simdutf_warn_unused virtual full_result base64_to_binary_details(
+      const char *input, size_t length, char *output, base64_options options,
+      last_chunk_handling_options last_chunk_options =
+          last_chunk_handling_options::loose) const noexcept;
+  simdutf_warn_unused virtual result
+  base64_to_binary(const char16_t *input, size_t length, char *output,
+                   base64_options options,
+                   last_chunk_handling_options last_chunk_options =
+                       last_chunk_handling_options::loose) const noexcept;
+  simdutf_warn_unused virtual full_result base64_to_binary_details(
       const char16_t *input, size_t length, char *output,
       base64_options options,
       last_chunk_handling_options last_chunk_options =
           last_chunk_handling_options::loose) const noexcept;
-  size_t binary_to_base64(const char *input, size_t length, char *output,
-                          base64_options options) const noexcept;
 };
-} // namespace fallback
+
+} // namespace lsx
 } // namespace simdutf
 
-#endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H
-/* end file src/simdutf/fallback/implementation.h */
+#endif // SIMDUTF_LSX_IMPLEMENTATION_H
+/* end file src/simdutf/lsx/implementation.h */
 
-/* begin file src/simdutf/fallback/begin.h */
-// redefining SIMDUTF_IMPLEMENTATION to "fallback"
-// #define SIMDUTF_IMPLEMENTATION fallback
-/* end file src/simdutf/fallback/begin.h */
+/* begin file src/simdutf/lsx/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "lsx"
+// #define SIMDUTF_IMPLEMENTATION lsx
+/* end file src/simdutf/lsx/begin.h */
 
   // Declarations
-/* begin file src/simdutf/fallback/bitmanipulation.h */
-#ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H
-#define SIMDUTF_FALLBACK_BITMANIPULATION_H
+/* begin file src/simdutf/lsx/intrinsics.h */
+#ifndef SIMDUTF_LSX_INTRINSICS_H
+#define SIMDUTF_LSX_INTRINSICS_H
+
+
+// Header that declares the LoongArch SX (__lsx_*) intrinsics
+// used throughout this implementation.
+#include <lsxintrin.h>
+
+#endif //  SIMDUTF_LSX_INTRINSICS_H
+/* end file src/simdutf/lsx/intrinsics.h */
+/* begin file src/simdutf/lsx/bitmanipulation.h */
+#ifndef SIMDUTF_LSX_BITMANIPULATION_H
+#define SIMDUTF_LSX_BITMANIPULATION_H
 
 #include <limits>
 
 namespace simdutf {
-namespace fallback {
-namespace {} // unnamed namespace
-} // namespace fallback
-} // namespace simdutf
+namespace lsx {
+namespace {
 
-#endif // SIMDUTF_FALLBACK_BITMANIPULATION_H
-/* end file src/simdutf/fallback/bitmanipulation.h */
+simdutf_really_inline int count_ones(uint64_t input_num) { // number of 1 bits
+  return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__lsx_vreplgr2vr_d(input_num)), 0);
+}
 
-/* begin file src/simdutf/fallback/end.h */
-/* end file src/simdutf/fallback/end.h */
+#if SIMDUTF_NEED_TRAILING_ZEROES
+simdutf_really_inline int trailing_zeroes(uint64_t input_num) {
+  return __builtin_ctzll(input_num); // result is undefined if input_num == 0
+}
+#endif
 
-#endif // SIMDUTF_IMPLEMENTATION_FALLBACK
-#endif // SIMDUTF_FALLBACK_H
-/* end file src/simdutf/fallback.h */
+} // unnamed namespace
+} // namespace lsx
+} // namespace simdutf
 
-/* begin file src/scalar/utf8.h */
-#ifndef SIMDUTF_UTF8_H
-#define SIMDUTF_UTF8_H
+#endif // SIMDUTF_LSX_BITMANIPULATION_H
+/* end file src/simdutf/lsx/bitmanipulation.h */
+/* begin file src/simdutf/lsx/simd.h */
+#ifndef SIMDUTF_LSX_SIMD_H
+#define SIMDUTF_LSX_SIMD_H
+
+#include <type_traits>
 
 namespace simdutf {
-namespace scalar {
+namespace lsx {
 namespace {
-namespace utf8 {
-#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_RVV
-// only used by the fallback kernel.
-// credit: based on code from Google Fuchsia (Apache Licensed)
-inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  uint64_t pos = 0;
-  uint32_t code_point = 0;
-  while (pos < len) {
-    // check of the next 16 bytes are ascii.
-    uint64_t next_pos = pos + 16;
-    if (next_pos <=
-        len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      std::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        pos = next_pos;
-        continue;
-      }
-    }
-    unsigned char byte = data[pos];
+namespace simd {
 
-    while (byte < 0b10000000) {
-      if (++pos == len) {
-        return true;
-      }
-      byte = data[pos];
-    }
+template <typename T> struct simd8;
 
-    if ((byte & 0b11100000) == 0b11000000) {
-      next_pos = pos + 2;
-      if (next_pos > len) {
-        return false;
-      }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return false;
-      }
-      // range check
-      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if ((code_point < 0x80) || (0x7ff < code_point)) {
-        return false;
-      }
-    } else if ((byte & 0b11110000) == 0b11100000) {
-      next_pos = pos + 3;
-      if (next_pos > len) {
-        return false;
-      }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return false;
-      }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-        return false;
-      }
-      // range check
-      code_point = (byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if ((code_point < 0x800) || (0xffff < code_point) ||
-          (0xd7ff < code_point && code_point < 0xe000)) {
-        return false;
-      }
-    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
-      next_pos = pos + 4;
-      if (next_pos > len) {
-        return false;
-      }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return false;
-      }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-        return false;
-      }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
-        return false;
-      }
-      // range check
-      code_point =
-          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff || 0x10ffff < code_point) {
-        return false;
-      }
-    } else {
-      // we may have a continuation
-      return false;
-    }
-    pos = next_pos;
-  }
-  return true;
-}
-#endif
+//
+// Base class of simd8<uint8_t> and simd8<bool>, both of which use __m128i
+// internally.
+//
+template <typename T, typename Mask = simd8<bool>> struct base_u8 {
+  __m128i value;
+  static const int SIZE = sizeof(value);
 
-inline simdutf_warn_unused result validate_with_errors(const char *buf,
-                                                       size_t len) noexcept {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  uint32_t code_point = 0;
-  while (pos < len) {
-    // check of the next 16 bytes are ascii.
-    size_t next_pos = pos + 16;
-    if (next_pos <=
-        len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      std::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        pos = next_pos;
-        continue;
-      }
-    }
-    unsigned char byte = data[pos];
+  // Conversion from/to SIMD register
+  simdutf_really_inline base_u8(const __m128i _value) : value(_value) {}
+  simdutf_really_inline operator const __m128i &() const { return this->value; }
+  simdutf_really_inline operator __m128i &() { return this->value; }
+  simdutf_really_inline T first() const {
+    return __lsx_vpickve2gr_bu(this->value, 0);
+  }
+  simdutf_really_inline T last() const {
+    return __lsx_vpickve2gr_bu(this->value, 15);
+  }
 
-    while (byte < 0b10000000) {
-      if (++pos == len) {
-        return result(error_code::SUCCESS, len);
-      }
-      byte = data[pos];
-    }
+  // Bit operations
+  simdutf_really_inline simd8<T> operator|(const simd8<T> other) const {
+    return __lsx_vor_v(this->value, other);
+  }
+  simdutf_really_inline simd8<T> operator&(const simd8<T> other) const {
+    return __lsx_vand_v(this->value, other);
+  }
+  simdutf_really_inline simd8<T> operator^(const simd8<T> other) const {
+    return __lsx_vxor_v(this->value, other);
+  }
+  simdutf_really_inline simd8<T> bit_andnot(const simd8<T> other) const {
+    return __lsx_vandn_v(this->value, other);
+  }
+  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
+  simdutf_really_inline simd8<T> &operator|=(const simd8<T> other) {
+    auto this_cast = static_cast<simd8<T> *>(this);
+    *this_cast = *this_cast | other;
+    return *this_cast;
+  }
+  simdutf_really_inline simd8<T> &operator&=(const simd8<T> other) {
+    auto this_cast = static_cast<simd8<T> *>(this);
+    *this_cast = *this_cast & other;
+    return *this_cast;
+  }
+  simdutf_really_inline simd8<T> &operator^=(const simd8<T> other) {
+    auto this_cast = static_cast<simd8<T> *>(this);
+    *this_cast = *this_cast ^ other;
+    return *this_cast;
+  }
 
-    if ((byte & 0b11100000) == 0b11000000) {
-      next_pos = pos + 2;
-      if (next_pos > len) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      // range check
-      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if ((code_point < 0x80) || (0x7ff < code_point)) {
-        return result(error_code::OVERLONG, pos);
-      }
-    } else if ((byte & 0b11110000) == 0b11100000) {
-      next_pos = pos + 3;
-      if (next_pos > len) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      // range check
-      code_point = (byte & 0b00001111) << 12 |
-                   (data[pos + 1] & 0b00111111) << 6 |
-                   (data[pos + 2] & 0b00111111);
-      if ((code_point < 0x800) || (0xffff < code_point)) {
-        return result(error_code::OVERLONG, pos);
-      }
-      if (0xd7ff < code_point && code_point < 0xe000) {
-        return result(error_code::SURROGATE, pos);
-      }
-    } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000
-      next_pos = pos + 4;
-      if (next_pos > len) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      // range check
-      code_point =
-          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
-          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff) {
-        return result(error_code::OVERLONG, pos);
-      }
-      if (0x10ffff < code_point) {
-        return result(error_code::TOO_LARGE, pos);
-      }
-    } else {
-      // we either have too many continuation bytes or an invalid leading byte
-      if ((byte & 0b11000000) == 0b10000000) {
-        return result(error_code::TOO_LONG, pos);
-      } else {
-        return result(error_code::HEADER_BITS, pos);
-      }
-    }
-    pos = next_pos;
+  friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
+                                               const simd8<T> rhs) {
+    return __lsx_vseq_b(lhs, rhs);
   }
-  return result(error_code::SUCCESS, len);
-}
 
-// Finds the previous leading byte starting backward from buf and validates with
-// errors from there Used to pinpoint the location of an error when an invalid
-// chunk is detected We assume that the stream starts with a leading byte, and
-// to check that it is the case, we ask that you pass a pointer to the start of
-// the stream (start).
-inline simdutf_warn_unused result rewind_and_validate_with_errors(
-    const char *start, const char *buf, size_t len) noexcept {
-  // First check that we start with a leading byte
-  if ((*start & 0b11000000) == 0b10000000) {
-    return result(error_code::TOO_LONG, 0);
+  template <int N = 1>
+  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
+    return __lsx_vor_v(__lsx_vbsll_v(this->value, N),
+                       __lsx_vbsrl_v(prev_chunk.value, 16 - N));
   }
-  size_t extra_len{0};
-  // A leading byte cannot be further than 4 bytes away
-  for (int i = 0; i < 5; i++) {
-    unsigned char byte = *buf;
-    if ((byte & 0b11000000) != 0b10000000) {
-      break;
-    } else {
-      buf--;
-      extra_len++;
-    }
+};
+
+// SIMD byte mask type (returned by things like eq and gt)
+template <> struct simd8<bool> : base_u8<bool> {
+  typedef uint16_t bitmask_t;
+  typedef uint32_t bitmask2_t;
+
+  static simdutf_really_inline simd8<bool> splat(bool _value) {
+    return __lsx_vreplgr2vr_b(uint8_t(-(!!_value))); // true -> 0xFF, false -> 0
+  }
 
-  result res = validate_with_errors(buf, len + extra_len);
-  res.count -= extra_len;
-  return res;
-}
+  simdutf_really_inline simd8(const __m128i _value) : base_u8<bool>(_value) {}
+  // False constructor
+  simdutf_really_inline simd8() : simd8(__lsx_vldi(0)) {}
+  // Splat constructor
+  simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {}
+  simdutf_really_inline void store(uint8_t dst[16]) const {
+    return __lsx_vst(this->value, dst, 0);
+  }
 
-inline size_t count_code_points(const char *buf, size_t len) {
-  const int8_t *p = reinterpret_cast<const int8_t *>(buf);
-  size_t counter{0};
-  for (size_t i = 0; i < len; i++) {
-    // -65 is 0b10111111, anything larger in two-complement's should start a new
-    // code point.
-    if (p[i] > -65) {
-      counter++;
-    }
+  simdutf_really_inline uint32_t to_bitmask() const {
+    return __lsx_vpickve2gr_wu(__lsx_vmsknz_b(*this), 0);
   }
-  return counter;
-}
 
-inline size_t utf16_length_from_utf8(const char *buf, size_t len) {
-  const int8_t *p = reinterpret_cast<const int8_t *>(buf);
-  size_t counter{0};
-  for (size_t i = 0; i < len; i++) {
-    if (p[i] > -65) {
-      counter++;
-    }
-    if (uint8_t(p[i]) >= 240) {
-      counter++;
-    }
+  simdutf_really_inline bool any() const {
+    return __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0) != 0;
   }
-  return counter;
-}
+  simdutf_really_inline bool none() const {
+    return __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0) == 0;
+  }
+  simdutf_really_inline bool all() const {
+    return __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0) == 0xFFFF;
+  }
+};
 
-simdutf_warn_unused inline size_t trim_partial_utf8(const char *input,
-                                                    size_t length) {
-  if (length < 3) {
-    switch (length) {
-    case 2:
-      if (uint8_t(input[length - 1]) >= 0xc0) {
-        return length - 1;
-      } // 2-, 3- and 4-byte characters with only 1 byte left
-      if (uint8_t(input[length - 2]) >= 0xe0) {
-        return length - 2;
-      } // 3- and 4-byte characters with only 2 bytes left
-      return length;
-    case 1:
-      if (uint8_t(input[length - 1]) >= 0xc0) {
-        return length - 1;
-      } // 2-, 3- and 4-byte characters with only 1 byte left
-      return length;
-    case 0:
-      return length;
-    }
+// Unsigned bytes
+template <> struct simd8<uint8_t> : base_u8<uint8_t> {
+  static simdutf_really_inline simd8<uint8_t> splat(uint8_t _value) {
+    return __lsx_vreplgr2vr_b(_value);
   }
-  if (uint8_t(input[length - 1]) >= 0xc0) {
-    return length - 1;
-  } // 2-, 3- and 4-byte characters with only 1 byte left
-  if (uint8_t(input[length - 2]) >= 0xe0) {
-    return length - 2;
-  } // 3- and 4-byte characters with only 1 byte left
-  if (uint8_t(input[length - 3]) >= 0xf0) {
-    return length - 3;
-  } // 4-byte characters with only 3 bytes left
-  return length;
-}
+  static simdutf_really_inline simd8<uint8_t> zero() { return __lsx_vldi(0); }
+  static simdutf_really_inline simd8<uint8_t> load(const uint8_t *values) {
+    return __lsx_vld(values, 0);
+  }
+  simdutf_really_inline simd8(const __m128i _value)
+      : base_u8<uint8_t>(_value) {}
+  // Zero constructor
+  simdutf_really_inline simd8() : simd8(zero()) {}
+  // Array constructor
+  simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {}
+  // Splat constructor
+  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+  // Member-by-member initialization
 
-} // namespace utf8
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+  simdutf_really_inline
+  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
+        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
+        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15)
+      : simd8((__m128i)v16u8{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+                             v12, v13, v14, v15}) {}
 
-#endif
-/* end file src/scalar/utf8.h */
-/* begin file src/scalar/utf16.h */
-#ifndef SIMDUTF_UTF16_H
-#define SIMDUTF_UTF16_H
+  // Repeat 16 values as many times as necessary (usually for lookup tables)
+  simdutf_really_inline static simd8<uint8_t>
+  repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
+            uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
+            uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
+            uint8_t v15) {
+    return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+                          v13, v14, v15);
+  }
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf16 {
+  // Store to array
+  simdutf_really_inline void store(uint8_t dst[16]) const {
+    return __lsx_vst(this->value, dst, 0);
+  }
 
-inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
-  return uint16_t((word >> 8) | (word << 8));
-}
+  // Saturated math
+  simdutf_really_inline simd8<uint8_t>
+  saturating_add(const simd8<uint8_t> other) const {
+    return __lsx_vsadd_bu(this->value, other);
+  }
+  simdutf_really_inline simd8<uint8_t>
+  saturating_sub(const simd8<uint8_t> other) const {
+    return __lsx_vssub_bu(this->value, other);
+  }
 
-template <endianness big_endian>
-inline simdutf_warn_unused bool validate(const char16_t *buf,
-                                         size_t len) noexcept {
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  uint64_t pos = 0;
-  while (pos < len) {
-    uint16_t word =
-        !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
-    if ((word & 0xF800) == 0xD800) {
-      if (pos + 1 >= len) {
-        return false;
-      }
-      uint16_t diff = uint16_t(word - 0xD800);
-      if (diff > 0x3FF) {
-        return false;
-      }
-      uint16_t next_word =
-          !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if (diff2 > 0x3FF) {
-        return false;
-      }
-      pos += 2;
-    } else {
-      pos++;
-    }
+  // Addition/subtraction are the same for signed and unsigned
+  simdutf_really_inline simd8<uint8_t>
+  operator+(const simd8<uint8_t> other) const {
+    return __lsx_vadd_b(this->value, other);
+  }
+  simdutf_really_inline simd8<uint8_t>
+  operator-(const simd8<uint8_t> other) const {
+    return __lsx_vsub_b(this->value, other);
+  }
+  simdutf_really_inline simd8<uint8_t> &operator+=(const simd8<uint8_t> other) {
+    *this = *this + other;
+    return *this;
+  }
+  simdutf_really_inline simd8<uint8_t> &operator-=(const simd8<uint8_t> other) {
+    *this = *this - other;
+    return *this;
   }
-  return true;
-}
 
-template <endianness big_endian>
-inline simdutf_warn_unused result validate_with_errors(const char16_t *buf,
-                                                       size_t len) noexcept {
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  while (pos < len) {
-    uint16_t word =
-        !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
-    if ((word & 0xF800) == 0xD800) {
-      if (pos + 1 >= len) {
-        return result(error_code::SURROGATE, pos);
-      }
-      uint16_t diff = uint16_t(word - 0xD800);
-      if (diff > 0x3FF) {
-        return result(error_code::SURROGATE, pos);
-      }
-      uint16_t next_word =
-          !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if (diff2 > 0x3FF) {
-        return result(error_code::SURROGATE, pos);
-      }
-      pos += 2;
+  // Order-specific operations
+  simdutf_really_inline simd8<uint8_t>
+  max_val(const simd8<uint8_t> other) const {
+    return __lsx_vmax_bu(*this, other);
+  }
+  simdutf_really_inline simd8<uint8_t>
+  min_val(const simd8<uint8_t> other) const {
+    return __lsx_vmin_bu(*this, other);
+  }
+  simdutf_really_inline simd8<bool>
+  operator<=(const simd8<uint8_t> other) const {
+    return __lsx_vsle_bu(*this, other);
+  }
+  simdutf_really_inline simd8<bool>
+  operator>=(const simd8<uint8_t> other) const {
+    return __lsx_vsle_bu(other, *this);
+  }
+  simdutf_really_inline simd8<bool>
+  operator<(const simd8<uint8_t> other) const {
+    return __lsx_vslt_bu(*this, other);
+  }
+  simdutf_really_inline simd8<bool>
+  operator>(const simd8<uint8_t> other) const {
+    return __lsx_vslt_bu(other, *this);
+  }
+  // Same as >, but the contract is weaker: false = 0 and true = nonzero. Here
+  // the operator> mask is returned unchanged, reinterpreted as simd8<uint8_t>.
+  simdutf_really_inline simd8<uint8_t>
+  gt_bits(const simd8<uint8_t> other) const {
+    return simd8<uint8_t>(*this > other);
+  }
+  // Same as <, but the contract is weaker: false = 0 and true = nonzero. Here
+  // the operator< mask is returned unchanged, reinterpreted as simd8<uint8_t>.
+  simdutf_really_inline simd8<uint8_t>
+  lt_bits(const simd8<uint8_t> other) const {
+    return simd8<uint8_t>(*this < other);
+  }
+
+  // Bit-specific operations
+  simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
+    return __lsx_vslt_bu(__lsx_vldi(0), __lsx_vand_v(this->value, bits));
+  }
+  simdutf_really_inline bool is_ascii() const {
+    return __lsx_vpickve2gr_hu(__lsx_vmskgez_b(this->value), 0) == 0xFFFF;
+  }
+
+  simdutf_really_inline bool any_bits_set_anywhere() const {
+    return __lsx_vpickve2gr_hu(__lsx_vmsknz_b(this->value), 0) > 0;
+  }
+  simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
+    return (*this & bits).any_bits_set_anywhere();
+  }
+  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
+    return __lsx_vsrli_b(this->value, N);
+  }
+  template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
+    return __lsx_vslli_b(this->value, N);
+  }
+
+  // Perform a lookup assuming each value is an index into the 16-entry table,
+  // i.e. between 0 and 15 (undefined behavior for out of range values)
+  template <typename L>
+  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+    return lookup_table.apply_lookup_16_to(*this);
+  }
+
+  template <typename L>
+  simdutf_really_inline simd8<L>
+  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+            L replace5, L replace6, L replace7, L replace8, L replace9,
+            L replace10, L replace11, L replace12, L replace13, L replace14,
+            L replace15) const {
+    return lookup_16(simd8<L>::repeat_16(
+        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+        replace7, replace8, replace9, replace10, replace11, replace12,
+        replace13, replace14, replace15));
+  }
+
+  template <typename T>
+  simdutf_really_inline simd8<uint8_t>
+  apply_lookup_16_to(const simd8<T> original) const {
+    __m128i original_tmp = __lsx_vand_v(original, __lsx_vldi(0x1f));
+    return __lsx_vshuf_b(__lsx_vldi(0), *this, simd8<uint8_t>(original_tmp));
+  }
+};
+
+// Signed bytes
+template <> struct simd8<int8_t> {
+  __m128i value;
+
+  static simdutf_really_inline simd8<int8_t> splat(int8_t _value) {
+    return __lsx_vreplgr2vr_b(_value);
+  }
+  static simdutf_really_inline simd8<int8_t> zero() { return __lsx_vldi(0); }
+  static simdutf_really_inline simd8<int8_t> load(const int8_t values[16]) {
+    return __lsx_vld(values, 0); // 16-byte vector load starting at values (offset 0)
+  }
+
+  template <endianness big_endian>
+  simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const { // widens the 16 bytes into 16 UTF-16 code units written at p
+    __m128i zero = __lsx_vldi(0);
+    if (match_system(big_endian)) { // NOTE(review): presumably "requested byte order == host byte order" - confirm match_system
+      __lsx_vst(__lsx_vilvl_b(zero, (__m128i)this->value), // low 8 bytes interleaved with zero (data byte first in memory, per LSX reference)
+                reinterpret_cast<uint16_t *>(p), 0);
+      __lsx_vst(__lsx_vilvh_b(zero, (__m128i)this->value), // high 8 bytes, same layout
+                reinterpret_cast<uint16_t *>(p + 8), 0);
     } else {
-      pos++;
+      __lsx_vst(__lsx_vilvl_b((__m128i)this->value, zero), // operands swapped: the zero byte comes first, i.e. byte-swapped code units
+                reinterpret_cast<uint16_t *>(p), 0);
+      __lsx_vst(__lsx_vilvh_b((__m128i)this->value, zero),
+                reinterpret_cast<uint16_t *>(p + 8), 0);
     }
   }
-  return result(error_code::SUCCESS, pos);
-}
 
-template <endianness big_endian>
-inline size_t count_code_points(const char16_t *buf, size_t len) {
-  // We are not BOM aware.
-  const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
-  size_t counter{0};
-  for (size_t i = 0; i < len; i++) {
-    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
-    counter += ((word & 0xFC00) != 0xDC00);
+  simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const { // widens the 16 bytes into 16 char32_t values written at p
+    __m128i zero = __lsx_vldi(0);
+    __m128i in16low = __lsx_vilvl_b(zero, (__m128i)this->value); // bytes 0..7 interleaved with zero -> 16-bit lanes
+    __m128i in16high = __lsx_vilvh_b(zero, (__m128i)this->value); // bytes 8..15 interleaved with zero -> 16-bit lanes
+    __m128i in32_0 = __lsx_vilvl_h(zero, in16low); // second interleave with zero: 16 -> 32 bits, source bytes 0..3
+    __m128i in32_1 = __lsx_vilvh_h(zero, in16low); // source bytes 4..7
+    __m128i in32_2 = __lsx_vilvl_h(zero, in16high); // source bytes 8..11
+    __m128i in32_3 = __lsx_vilvh_h(zero, in16high); // source bytes 12..15
+    __lsx_vst(in32_0, reinterpret_cast<uint32_t *>(p), 0); // four 16-byte stores, 4 code points each
+    __lsx_vst(in32_1, reinterpret_cast<uint32_t *>(p + 4), 0);
+    __lsx_vst(in32_2, reinterpret_cast<uint32_t *>(p + 8), 0);
+    __lsx_vst(in32_3, reinterpret_cast<uint32_t *>(p + 12), 0);
   }
-  return counter;
-}
 
-template <endianness big_endian>
-inline size_t utf8_length_from_utf16(const char16_t *buf, size_t len) {
-  // We are not BOM aware.
-  const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
-  size_t counter{0};
-  for (size_t i = 0; i < len; i++) {
-    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
-    counter++; // ASCII
-    counter += static_cast<size_t>(
-        word >
-        0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
-    counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
-                                   (word >= 0xE000)); // three-byte
-  }
-  return counter;
-}
+  // In places where the table can be reused, which is most uses in simdutf, it
+  // is worth it to do 4 table lookups, as there is no direct zero extension
+  // from u8 to u32.
+  simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t *p) const {
+    const simd8<uint8_t> tb1{0, 255, 255, 255, 1, 255, 255, 255, // shuffle indices: source byte k goes to the first byte of 32-bit lane k
+                             2, 255, 255, 255, 3, 255, 255, 255}; // 255 is masked to 31 by apply_lookup_16_to; NOTE(review): presumably yields a zero byte - confirm
+    const simd8<uint8_t> tb2{4, 255, 255, 255, 5, 255, 255, 255, // same pattern for source bytes 4..7
+                             6, 255, 255, 255, 7, 255, 255, 255};
+    const simd8<uint8_t> tb3{8,  255, 255, 255, 9,  255, 255, 255, // source bytes 8..11
+                             10, 255, 255, 255, 11, 255, 255, 255};
+    const simd8<uint8_t> tb4{12, 255, 255, 255, 13, 255, 255, 255, // source bytes 12..15
+                             14, 255, 255, 255, 15, 255, 255, 255};
 
-template <endianness big_endian>
-inline size_t utf32_length_from_utf16(const char16_t *buf, size_t len) {
-  // We are not BOM aware.
-  const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
-  size_t counter{0};
-  for (size_t i = 0; i < len; i++) {
-    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
-    counter += ((word & 0xFC00) != 0xDC00);
+    // encourage store pairing and interleaving
+    const auto shuf1 = this->apply_lookup_16_to(tb1); // here *this (the data bytes) acts as the table and tbN as the indices
+    const auto shuf2 = this->apply_lookup_16_to(tb2);
+    shuf1.store(reinterpret_cast<int8_t *>(p)); // each store writes 16 bytes = 4 char32_t
+    shuf2.store(reinterpret_cast<int8_t *>(p + 4));
+
+    const auto shuf3 = this->apply_lookup_16_to(tb3);
+    const auto shuf4 = this->apply_lookup_16_to(tb4);
+    shuf3.store(reinterpret_cast<int8_t *>(p + 8));
+    shuf4.store(reinterpret_cast<int8_t *>(p + 12));
   }
-  return counter;
-}
+  // Conversion from/to SIMD register (non-explicit: raw intrinsic results convert implicitly)
+  simdutf_really_inline simd8(const __m128i _value) : value(_value) {}
+  simdutf_really_inline operator const __m128i &() const { return this->value; } // read-only reference to the wrapped register
 
-inline size_t latin1_length_from_utf16(size_t len) { return len; }
+  simdutf_really_inline operator const __m128i() const { return this->value; } // by-value copy of the wrapped register
 
-simdutf_really_inline void change_endianness_utf16(const char16_t *in,
-                                                   size_t size, char16_t *out) {
-  const uint16_t *input = reinterpret_cast<const uint16_t *>(in);
-  uint16_t *output = reinterpret_cast<uint16_t *>(out);
-  for (size_t i = 0; i < size; i++) {
-    *output++ = uint16_t(input[i] >> 8 | input[i] << 8);
+  simdutf_really_inline operator __m128i &() { return this->value; } // mutable reference to the wrapped register
+
+  // Zero constructor: a default-constructed vector holds zero()
+  simdutf_really_inline simd8() : simd8(zero()) {}
+  // Splat constructor: delegates to splat(_value); not explicit, so an int8_t converts implicitly
+  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+  // Array constructor: delegates to load(values), which reads 16 bytes from values
+  simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {}
+  // Member-by-member initialization: v0 is element 0 of the vector, v15 element 15
+
+  simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3,
+                              int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+                              int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+                              int8_t v12, int8_t v13, int8_t v14, int8_t v15)
+      : simd8((__m128i)v16i8{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, // brace-initialized v16i8, then cast to __m128i
+                             v12, v13, v14, v15}) {}
+
+  // Repeat 16 values as many times as necessary (usually for lookup tables)
+  simdutf_really_inline static simd8<int8_t>
+  repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+            int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+            int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
+    return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, // 16 values fill the vector exactly once here
+                         v13, v14, v15);
   }
-}
 
-template <endianness big_endian>
-simdutf_warn_unused inline size_t trim_partial_utf16(const char16_t *input,
-                                                     size_t length) {
-  if (length <= 1) {
-    return length;
+  // Store to array: writes the 16 bytes of this vector starting at dst
+  simdutf_really_inline void store(int8_t dst[16]) const {
+    return __lsx_vst(value, dst, 0); // returning a void expression from a void function is legal C++
   }
-  uint16_t last_word = uint16_t(input[length - 1]);
-  last_word = !match_system(big_endian) ? swap_bytes(last_word) : last_word;
-  length -= ((last_word & 0xFC00) == 0xD800);
-  return length;
-}
 
-} // namespace utf16
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+  simdutf_really_inline operator simd8<uint8_t>() const { // same 128 bits viewed as unsigned bytes
+    return ((__m128i)this->value);
+  }
 
-#endif
-/* end file src/scalar/utf16.h */
-/* begin file src/scalar/utf32.h */
-#ifndef SIMDUTF_UTF32_H
-#define SIMDUTF_UTF32_H
+  simdutf_really_inline simd8<int8_t>
+  operator|(const simd8<int8_t> other) const {
+    return __lsx_vor_v((__m128i)value, (__m128i)other.value); // bitwise OR of the two registers
+  }
+  simdutf_really_inline simd8<int8_t>
+  operator&(const simd8<int8_t> other) const {
+    return __lsx_vand_v((__m128i)value, (__m128i)other.value); // bitwise AND of the two registers
+  }
+  simdutf_really_inline simd8<int8_t>
+  operator^(const simd8<int8_t> other) const {
+    return __lsx_vxor_v((__m128i)value, (__m128i)other.value); // bitwise XOR of the two registers
+  }
+  simdutf_really_inline simd8<int8_t>
+  bit_andnot(const simd8<int8_t> other) const {
+    return __lsx_vandn_v((__m128i)other.value, (__m128i)value); // NOTE(review): vandn.v presumably is (~first) & second, i.e. value & ~other - confirm
+  }
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf32 {
+  // Math (lane-wise on 8-bit elements)
+  simdutf_really_inline simd8<int8_t>
+  operator+(const simd8<int8_t> other) const {
+    return __lsx_vadd_b((__m128i)value, (__m128i)other.value); // per-byte addition
+  }
+  simdutf_really_inline simd8<int8_t>
+  operator-(const simd8<int8_t> other) const {
+    return __lsx_vsub_b((__m128i)value, (__m128i)other.value); // per-byte subtraction: value - other
+  }
+  simdutf_really_inline simd8<int8_t> &operator+=(const simd8<int8_t> other) {
+    *this = *this + other; // implemented through operator+
+    return *this;
+  }
+  simdutf_really_inline simd8<int8_t> &operator-=(const simd8<int8_t> other) {
+    *this = *this - other; // implemented through operator-
+    return *this;
+  }
 
-inline simdutf_warn_unused bool validate(const char32_t *buf,
-                                         size_t len) noexcept {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  uint64_t pos = 0;
-  for (; pos < len; pos++) {
-    uint32_t word = data[pos];
-    if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
-      return false;
-    }
+  simdutf_really_inline bool is_ascii() const { // true when all 16 bytes are >= 0 as signed values (no byte has its top bit set)
+    return (__lsx_vpickve2gr_hu(__lsx_vmskgez_b((__m128i)this->value), 0) == // vmskgez.b: one mask bit per byte that is >= 0
+            0xffff); // all 16 mask bits set
   }
-  return true;
-}
 
-inline simdutf_warn_unused result validate_with_errors(const char32_t *buf,
-                                                       size_t len) noexcept {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  for (; pos < len; pos++) {
-    uint32_t word = data[pos];
-    if (word > 0x10FFFF) {
-      return result(error_code::TOO_LARGE, pos);
-    }
-    if (word >= 0xD800 && word <= 0xDFFF) {
-      return result(error_code::SURROGATE, pos);
-    }
+  // Order-sensitive comparisons (signed 8-bit lanes)
+  simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const {
+    return __lsx_vmax_b((__m128i)value, (__m128i)other.value); // per-lane signed maximum
+  }
+  simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const {
+    return __lsx_vmin_b((__m128i)value, (__m128i)other.value); // per-lane signed minimum
+  }
+  simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
+    return __lsx_vslt_b((__m128i)other.value, (__m128i)value); // a > b evaluated as b < a (signed less-than, operands swapped)
+  }
+  simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
+    return __lsx_vslt_b((__m128i)value, (__m128i)other.value); // per-lane signed less-than mask
+  }
+  simdutf_really_inline simd8<bool>
+  operator==(const simd8<int8_t> other) const {
+    return __lsx_vseq_b((__m128i)value, (__m128i)other.value); // per-lane equality mask
   }
-  return result(error_code::SUCCESS, pos);
-}
 
-inline size_t utf8_length_from_utf32(const char32_t *buf, size_t len) {
-  // We are not BOM aware.
-  const uint32_t *p = reinterpret_cast<const uint32_t *>(buf);
-  size_t counter{0};
-  for (size_t i = 0; i < len; i++) {
-    // credit: @ttsugriy  for the vectorizable approach
-    counter++;                                     // ASCII
-    counter += static_cast<size_t>(p[i] > 0x7F);   // two-byte
-    counter += static_cast<size_t>(p[i] > 0x7FF);  // three-byte
-    counter += static_cast<size_t>(p[i] > 0xFFFF); // four-bytes
+  template <int N = 1>
+  simdutf_really_inline simd8<int8_t>
+  prev(const simd8<int8_t> prev_chunk) const { // last N bytes of prev_chunk followed by the first 16 - N bytes of *this
+    return __lsx_vor_v(__lsx_vbsll_v(this->value, N), // *this moved up by N byte positions
+                       __lsx_vbsrl_v(prev_chunk.value, 16 - N)); // top N bytes of prev_chunk moved down to positions 0..N-1
   }
-  return counter;
-}
 
-inline size_t utf16_length_from_utf32(const char32_t *buf, size_t len) {
-  // We are not BOM aware.
-  const uint32_t *p = reinterpret_cast<const uint32_t *>(buf);
-  size_t counter{0};
-  for (size_t i = 0; i < len; i++) {
-    counter++;                                     // non-surrogate word
-    counter += static_cast<size_t>(p[i] > 0xFFFF); // surrogate pair
+  // Perform a lookup assuming no value is larger than 15
+  template <typename L>
+  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+    return lookup_table.apply_lookup_16_to(*this); // *this supplies the indices, lookup_table the 16 entries
+  }
+  template <typename L>
+  simdutf_really_inline simd8<L>
+  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+            L replace5, L replace6, L replace7, L replace8, L replace9,
+            L replace10, L replace11, L replace12, L replace13, L replace14,
+            L replace15) const { // replaceK is the table entry returned for index K
+    return lookup_16(simd8<L>::repeat_16( // pack the 16 entries into a table, then use the table overload
+        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+        replace7, replace8, replace9, replace10, replace11, replace12,
+        replace13, replace14, replace15));
   }
-  return counter;
-}
 
-inline size_t latin1_length_from_utf32(size_t len) {
-  // We are not BOM aware.
-  return len; // a utf32 codepoint will always represent 1 latin1 character
-}
+  template <typename T>
+  simdutf_really_inline simd8<int8_t>
+  apply_lookup_16_to(const simd8<T> original) const { // *this is the 16-byte table, original holds the per-byte indices
+    __m128i original_tmp = __lsx_vand_v(original, __lsx_vldi(0x1f)); // AND the indices with the vldi(0x1f) constant (presumably 0x1f in every byte)
+    return __lsx_vshuf_b(__lsx_vldi(0), (__m128i)this->value, // NOTE(review): indices 16..31 presumably pick the zero operand - confirm vs vshuf.b docs
+                         simd8<uint8_t>(original_tmp));
+  }
+};
 
-inline simdutf_warn_unused uint32_t swap_bytes(const uint32_t word) {
-  return ((word >> 24) & 0xff) |      // move byte 3 to byte 0
-         ((word << 8) & 0xff0000) |   // move byte 1 to byte 2
-         ((word >> 8) & 0xff00) |     // move byte 2 to byte 1
-         ((word << 24) & 0xff000000); // byte 0 to byte 3
-}
+template <typename T> struct simd8x64 {
+  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
+  static_assert(
+      NUM_CHUNKS == 4,
+      "LoongArch kernel should use four registers per 64-byte block.");
+  simd8<T> chunks[NUM_CHUNKS];
 
-} // namespace utf32
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
+  simd8x64<T> &
+  operator=(const simd8<T> other) = delete; // no assignment allowed; NOTE(review): parameter is simd8<T>, not simd8x64<T> - confirm intent
+  simd8x64() = delete;                      // no default constructor allowed
 
-#endif
-/* end file src/scalar/utf32.h */
-/* begin file src/scalar/base64.h */
-#ifndef SIMDUTF_BASE64_H
-#define SIMDUTF_BASE64_H
+  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1,
+                                 const simd8<T> chunk2, const simd8<T> chunk3)
+      : chunks{chunk0, chunk1, chunk2, chunk3} {} // chunkK becomes chunks[K]
+  simdutf_really_inline simd8x64(const T *ptr) // loads four consecutive vectors starting at ptr
+      : chunks{simd8<T>::load(ptr),
+               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T)), // advance by one vector's worth of T elements
+               simd8<T>::load(ptr + 2 * sizeof(simd8<T>) / sizeof(T)),
+               simd8<T>::load(ptr + 3 * sizeof(simd8<T>) / sizeof(T))} {}
 
-#include <cstddef>
-#include <cstdint>
-#include <cstring>
-#include <iostream>
+  simdutf_really_inline void store(T *ptr) const { // writes the four chunks back to back starting at ptr
+    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T)); // offsets are in T elements: K vectors' worth for chunk K
+    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
+    this->chunks[2].store(ptr + sizeof(simd8<T>) * 2 / sizeof(T));
+    this->chunks[3].store(ptr + sizeof(simd8<T>) * 3 / sizeof(T));
+  }
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace base64 {
+  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) { // chunk-wise |= with other
+    this->chunks[0] |= other.chunks[0]; // relies on simd8<T> providing operator|=
+    this->chunks[1] |= other.chunks[1];
+    this->chunks[2] |= other.chunks[2];
+    this->chunks[3] |= other.chunks[3];
+    return *this;
+  }
 
-// This function is not expected to be fast. Do not use in long loops.
-template <class char_type> bool is_ascii_white_space(char_type c) {
-  return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f';
-}
+  simdutf_really_inline simd8<T> reduce_or() const { // OR of all four chunks, combined pairwise
+    return (this->chunks[0] | this->chunks[1]) |
+           (this->chunks[2] | this->chunks[3]);
+  }
 
-template <class char_type> bool is_ascii_white_space_or_padding(char_type c) {
-  return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' ||
-         c == '=';
-}
+  simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); } // one is_ascii() test on the OR of the four chunks
 
-template <class char_type> bool is_eight_byte(char_type c) {
-  if (sizeof(char_type) == 1) {
-    return true;
+  template <endianness endian>
+  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { // forwards each chunk to simd8<T>::store_ascii_as_utf16<endian>
+    this->chunks[0].template store_ascii_as_utf16<endian>(ptr + // offset is in char16_t units: sizeof(simd8<T>) code units per chunk
+                                                          sizeof(simd8<T>) * 0);
+    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
+                                                          sizeof(simd8<T>) * 1);
+    this->chunks[2].template store_ascii_as_utf16<endian>(ptr +
+                                                          sizeof(simd8<T>) * 2);
+    this->chunks[3].template store_ascii_as_utf16<endian>(ptr +
+                                                          sizeof(simd8<T>) * 3);
   }
-  return uint8_t(c) == c;
-}
 
-// Returns true upon success. The destination buffer must be large enough.
-// This functions assumes that the padding (=) has been removed.
-template <class char_type>
-full_result
-base64_tail_decode(char *dst, const char_type *src, size_t length,
-                   size_t padded_characters, // number of padding characters
-                                             // '=', typically 0, 1, 2.
-                   base64_options options,
-                   last_chunk_handling_options last_chunk_options) {
-  // This looks like 5 branches, but we expect the compiler to resolve this to a
-  // single branch:
-  const uint8_t *to_base64 = (options & base64_url)
-                                 ? tables::base64::to_base64_url_value
-                                 : tables::base64::to_base64_value;
-  const uint32_t *d0 = (options & base64_url)
-                           ? tables::base64::base64_url::d0
-                           : tables::base64::base64_default::d0;
-  const uint32_t *d1 = (options & base64_url)
-                           ? tables::base64::base64_url::d1
-                           : tables::base64::base64_default::d1;
-  const uint32_t *d2 = (options & base64_url)
-                           ? tables::base64::base64_url::d2
-                           : tables::base64::base64_default::d2;
-  const uint32_t *d3 = (options & base64_url)
-                           ? tables::base64::base64_url::d3
-                           : tables::base64::base64_default::d3;
+  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const { // forwards each chunk to the table-based store_ascii_as_utf32_tbl
+    this->chunks[0].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 0); // offset is in char32_t units: sizeof(simd8<T>) code points per chunk
+    this->chunks[1].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 1);
+    this->chunks[2].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 2);
+    this->chunks[3].store_ascii_as_utf32_tbl(ptr + sizeof(simd8<T>) * 3);
+  }
 
-  const char_type *srcend = src + length;
-  const char_type *srcinit = src;
-  const char *dstinit = dst;
+  simdutf_really_inline uint64_t to_bitmask() const { // 64-bit mask with one bit per byte of the block; bit set when that byte is non-zero
+    __m128i mask = __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[3]), 6); // chunk 3 mask moved up 6 bytes -> bits 48..63
+    mask = __lsx_vor_v(mask, __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[2]), 4)); // chunk 2 -> bits 32..47
+    mask = __lsx_vor_v(mask, __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[1]), 2)); // chunk 1 -> bits 16..31
+    mask = __lsx_vor_v(mask, __lsx_vmsknz_b(this->chunks[0])); // chunk 0 -> bits 0..15
+    return __lsx_vpickve2gr_du(mask, 0); // read the combined mask from 64-bit element 0
+  }
 
-  uint32_t x;
-  size_t idx;
-  uint8_t buffer[4];
-  while (true) {
-    while (src + 4 <= srcend && is_eight_byte(src[0]) &&
-           is_eight_byte(src[1]) && is_eight_byte(src[2]) &&
-           is_eight_byte(src[3]) &&
-           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
-                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
-      if (match_system(endianness::BIG)) {
-        x = scalar::utf32::swap_bytes(x);
-      }
-      std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes
-      dst += 3;
-      src += 4;
-    }
-    idx = 0;
-    // we need at least four characters.
-    while (idx < 4 && src < srcend) {
-      char_type c = *src;
-      uint8_t code = to_base64[uint8_t(c)];
-      buffer[idx] = uint8_t(code);
-      if (is_eight_byte(c) && code <= 63) {
-        idx++;
-      } else if (code > 64 || !scalar::base64::is_eight_byte(c)) {
-        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
-                size_t(dst - dstinit)};
-      } else {
-        // We have a space or a newline. We ignore it.
-      }
-      src++;
-    }
-    if (idx != 4) {
-      if (last_chunk_options == last_chunk_handling_options::strict &&
-          (idx != 1) && ((idx + padded_characters) & 3) != 0) {
-        // The partial chunk was at src - idx
-        return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
-                size_t(dst - dstinit)};
-      } else if (last_chunk_options ==
-                     last_chunk_handling_options::stop_before_partial &&
-                 (idx != 1) && ((idx + padded_characters) & 3) != 0) {
-        // Rewind src to before partial chunk
-        src -= idx;
-        return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
-      } else {
-        if (idx == 2) {
-          uint32_t triple =
-              (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6);
-          if ((last_chunk_options == last_chunk_handling_options::strict) &&
-              (triple & 0xffff)) {
-            return {BASE64_EXTRA_BITS, size_t(src - srcinit),
-                    size_t(dst - dstinit)};
-          }
-          if (match_system(endianness::BIG)) {
-            triple <<= 8;
-            std::memcpy(dst, &triple, 1);
-          } else {
-            triple = scalar::utf32::swap_bytes(triple);
-            triple >>= 8;
-            std::memcpy(dst, &triple, 1);
-          }
-          dst += 1;
-        } else if (idx == 3) {
-          uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
-                            (uint32_t(buffer[1]) << 2 * 6) +
-                            (uint32_t(buffer[2]) << 1 * 6);
-          if ((last_chunk_options == last_chunk_handling_options::strict) &&
-              (triple & 0xff)) {
-            return {BASE64_EXTRA_BITS, size_t(src - srcinit),
-                    size_t(dst - dstinit)};
-          }
-          if (match_system(endianness::BIG)) {
-            triple <<= 8;
-            std::memcpy(dst, &triple, 2);
-          } else {
-            triple = scalar::utf32::swap_bytes(triple);
-            triple >>= 8;
-            std::memcpy(dst, &triple, 2);
-          }
-          dst += 2;
-        } else if (idx == 1) {
-          return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
-                  size_t(dst - dstinit)};
-        }
-        return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
-      }
-    }
-
-    uint32_t triple =
-        (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
-        (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6);
-    if (match_system(endianness::BIG)) {
-      triple <<= 8;
-      std::memcpy(dst, &triple, 3);
-    } else {
-      triple = scalar::utf32::swap_bytes(triple);
-      triple >>= 8;
-      std::memcpy(dst, &triple, 3);
-    }
-    dst += 3;
+  simdutf_really_inline uint64_t eq(const T m) const { // bitmask of the positions whose element equals m
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
+                          this->chunks[2] == mask, this->chunks[3] == mask)
+        .to_bitmask();
   }
-}
 
-// like base64_tail_decode, but it will not write past the end of the output
-// buffer. The outlen paramter is modified to reflect the number of bytes
-// written. This functions assumes that the padding (=) has been removed.
-template <class char_type>
-result base64_tail_decode_safe(
-    char *dst, size_t &outlen, const char_type *&srcr, size_t length,
-    size_t padded_characters, // number of padding characters '=', typically 0,
-                              // 1, 2.
-    base64_options options, last_chunk_handling_options last_chunk_options) {
-  const char_type *src = srcr;
-  if (length == 0) {
-    outlen = 0;
-    return {SUCCESS, 0};
+  simdutf_really_inline uint64_t lteq(const T m) const { // bitmask of the positions whose element is <= m
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask, // relies on simd8<T>::operator<=
+                          this->chunks[2] <= mask, this->chunks[3] <= mask)
+        .to_bitmask();
   }
-  // This looks like 5 branches, but we expect the compiler to resolve this to a
-  // single branch:
-  const uint8_t *to_base64 = (options & base64_url)
-                                 ? tables::base64::to_base64_url_value
-                                 : tables::base64::to_base64_value;
-  const uint32_t *d0 = (options & base64_url)
-                           ? tables::base64::base64_url::d0
-                           : tables::base64::base64_default::d0;
-  const uint32_t *d1 = (options & base64_url)
-                           ? tables::base64::base64_url::d1
-                           : tables::base64::base64_default::d1;
-  const uint32_t *d2 = (options & base64_url)
-                           ? tables::base64::base64_url::d2
-                           : tables::base64::base64_default::d2;
-  const uint32_t *d3 = (options & base64_url)
-                           ? tables::base64::base64_url::d3
-                           : tables::base64::base64_default::d3;
 
-  const char_type *srcend = src + length;
-  const char_type *srcinit = src;
-  const char *dstinit = dst;
-  const char *dstend = dst + outlen;
+  simdutf_really_inline uint64_t in_range(const T low, const T high) const { // bitmask of the positions with low <= element <= high
+    const simd8<T> mask_low = simd8<T>::splat(low);
+    const simd8<T> mask_high = simd8<T>::splat(high);
 
-  uint32_t x;
-  size_t idx;
-  uint8_t buffer[4];
-  while (true) {
-    while (src + 4 <= srcend && is_eight_byte(src[0]) &&
-           is_eight_byte(src[1]) && is_eight_byte(src[2]) &&
-           is_eight_byte(src[3]) &&
-           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
-                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
-      if (dstend - dst < 3) {
-        outlen = size_t(dst - dstinit);
-        srcr = src;
-        return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
-      }
-      if (match_system(endianness::BIG)) {
-        x = scalar::utf32::swap_bytes(x);
-      }
-      std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes
-      dst += 3;
-      src += 4;
-    }
-    idx = 0;
-    const char_type *srccur = src;
-    // We need at least four characters.
-    while (idx < 4 && src < srcend) {
-      char_type c = *src;
-      uint8_t code = to_base64[uint8_t(c)];
+    return simd8x64<bool>(
+               (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), // both bounds inclusive; relies on simd8<T> operator<= and operator>=
+               (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+               (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+               (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+        .to_bitmask();
+  }
+  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { // bitmask of the positions with element < low or element > high
+    const simd8<T> mask_low = simd8<T>::splat(low);
+    const simd8<T> mask_high = simd8<T>::splat(high);
+    return simd8x64<bool>(
+               (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), // strict comparisons: the complement of in_range's inclusive test
+               (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+               (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+               (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
+        .to_bitmask();
+  }
+  simdutf_really_inline uint64_t lt(const T m) const { // bitmask of the positions whose element is < m
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
+                          this->chunks[2] < mask, this->chunks[3] < mask)
+        .to_bitmask();
+  }
+  simdutf_really_inline uint64_t gt(const T m) const { // bitmask of the positions whose element is > m
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask,
+                          this->chunks[2] > mask, this->chunks[3] > mask)
+        .to_bitmask();
+  }
+  simdutf_really_inline uint64_t gteq(const T m) const { // bitmask of the positions whose element is >= m
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask, // relies on simd8<T>::operator>=
+                          this->chunks[2] >= mask, this->chunks[3] >= mask)
+        .to_bitmask();
+  }
+  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { // like gteq, but every chunk is compared as simd8<uint8_t>
+    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+    return simd8x64<bool>(simd8<uint8_t>(this->chunks[0].value) >= mask, // rewrap the raw register as unsigned bytes before comparing
+                          simd8<uint8_t>(this->chunks[1].value) >= mask,
+                          simd8<uint8_t>(this->chunks[2].value) >= mask,
+                          simd8<uint8_t>(this->chunks[3].value) >= mask)
+        .to_bitmask();
+  }
+}; // struct simd8x64<T>
+/* begin file src/simdutf/lsx/simd16-inl.h */
+template <typename T> struct simd16;
 
-      buffer[idx] = uint8_t(code);
-      if (is_eight_byte(c) && code <= 63) {
-        idx++;
-      } else if (code > 64 || !scalar::base64::is_eight_byte(c)) {
-        outlen = size_t(dst - dstinit);
-        srcr = src;
-        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
-      } else {
-        // We have a space or a newline. We ignore it.
-      }
-      src++;
-    }
-    if (idx != 4) {
-      if (last_chunk_options == last_chunk_handling_options::strict &&
-          ((idx + padded_characters) & 3) != 0) {
-        outlen = size_t(dst - dstinit);
-        srcr = src;
-        return {BASE64_INPUT_REMAINDER, size_t(src - srcinit)};
-      } else if (last_chunk_options ==
-                     last_chunk_handling_options::stop_before_partial &&
-                 ((idx + padded_characters) & 3) != 0) {
-        // Rewind src to before partial chunk
-        srcr = srccur;
-        outlen = size_t(dst - dstinit);
-        return {SUCCESS, size_t(dst - dstinit)};
-      } else { // loose mode
-        if (idx == 0) {
-          // No data left; return success
-          outlen = size_t(dst - dstinit);
-          srcr = src;
-          return {SUCCESS, size_t(dst - dstinit)};
-        } else if (idx == 1) {
-          // Error: Incomplete chunk of length 1 is invalid in loose mode
-          outlen = size_t(dst - dstinit);
-          srcr = src;
-          return {BASE64_INPUT_REMAINDER, size_t(src - srcinit)};
-        } else if (idx == 2 || idx == 3) {
-          // Check if there's enough space in the destination buffer
-          size_t required_space = (idx == 2) ? 1 : 2;
-          if (size_t(dstend - dst) < required_space) {
-            outlen = size_t(dst - dstinit);
-            srcr = src;
-            return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
-          }
-          uint32_t triple = 0;
-          if (idx == 2) {
-            triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12);
-            if ((last_chunk_options == last_chunk_handling_options::strict) &&
-                (triple & 0xffff)) {
-              srcr = src;
-              return {BASE64_EXTRA_BITS, size_t(src - srcinit)};
-            }
-            // Extract the first byte
-            triple >>= 16;
-            dst[0] = static_cast<char>(triple & 0xFF);
-            dst += 1;
-          } else if (idx == 3) {
-            triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12) +
-                     (uint32_t(buffer[2]) << 6);
-            if ((last_chunk_options == last_chunk_handling_options::strict) &&
-                (triple & 0xff)) {
-              srcr = src;
-              return {BASE64_EXTRA_BITS, size_t(src - srcinit)};
-            }
-            // Extract the first two bytes
-            triple >>= 8;
-            dst[0] = static_cast<char>((triple >> 8) & 0xFF);
-            dst[1] = static_cast<char>(triple & 0xFF);
-            dst += 2;
-          }
-          outlen = size_t(dst - dstinit);
-          srcr = src;
-          return {SUCCESS, size_t(dst - dstinit)};
-        }
-      }
-    }
+template <typename T, typename Mask = simd16<bool>> struct base_u16 {
+  __m128i value;
+  static const int SIZE = sizeof(value);
 
-    if (dstend - dst < 3) {
-      outlen = size_t(dst - dstinit);
-      srcr = src;
-      return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
-    }
-    uint32_t triple = (uint32_t(buffer[0]) << 18) +
-                      (uint32_t(buffer[1]) << 12) + (uint32_t(buffer[2]) << 6) +
-                      (uint32_t(buffer[3]));
-    if (match_system(endianness::BIG)) {
-      triple <<= 8;
-      std::memcpy(dst, &triple, 3);
-    } else {
-      triple = scalar::utf32::swap_bytes(triple);
-      triple >>= 8;
-      std::memcpy(dst, &triple, 3);
-    }
-    dst += 3;
+  // Conversion from/to SIMD register
+  simdutf_really_inline base_u16() = default;
+  simdutf_really_inline base_u16(const __m128i _value) : value(_value) {}
+  // Bit operations
+  simdutf_really_inline simd16<T> operator|(const simd16<T> other) const {
+    return __lsx_vor_v(this->value, other.value);
   }
-}
-
-// Returns the number of bytes written. The destination buffer must be large
-// enough. It will add padding (=) if needed.
-size_t tail_encode_base64(char *dst, const char *src, size_t srclen,
-                          base64_options options) {
-  // By default, we use padding if we are not using the URL variant.
-  // This is check with ((options & base64_url) == 0) which returns true if we
-  // are not using the URL variant. However, we also allow 'inversion' of the
-  // convention with the base64_reverse_padding option. If the
-  // base64_reverse_padding option is set, we use padding if we are using the
-  // URL variant, and we omit it if we are not using the URL variant. This is
-  // checked with
-  // ((options & base64_reverse_padding) == base64_reverse_padding).
-  bool use_padding =
-      ((options & base64_url) == 0) ^
-      ((options & base64_reverse_padding) == base64_reverse_padding);
-  // This looks like 3 branches, but we expect the compiler to resolve this to
-  // a single branch:
-  const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0
-                                          : tables::base64::base64_default::e0;
-  const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1
-                                          : tables::base64::base64_default::e1;
-  const char *e2 = (options & base64_url) ? tables::base64::base64_url::e2
-                                          : tables::base64::base64_default::e2;
-  char *out = dst;
-  size_t i = 0;
-  uint8_t t1, t2, t3;
-  for (; i + 2 < srclen; i += 3) {
-    t1 = uint8_t(src[i]);
-    t2 = uint8_t(src[i + 1]);
-    t3 = uint8_t(src[i + 2]);
-    *out++ = e0[t1];
-    *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
-    *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
-    *out++ = e2[t3];
+  simdutf_really_inline simd16<T> operator&(const simd16<T> other) const {
+    return __lsx_vand_v(this->value, other.value);
   }
-  switch (srclen - i) {
-  case 0:
-    break;
-  case 1:
-    t1 = uint8_t(src[i]);
-    *out++ = e0[t1];
-    *out++ = e1[(t1 & 0x03) << 4];
-    if (use_padding) {
-      *out++ = '=';
-      *out++ = '=';
-    }
-    break;
-  default: /* case 2 */
-    t1 = uint8_t(src[i]);
-    t2 = uint8_t(src[i + 1]);
-    *out++ = e0[t1];
-    *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
-    *out++ = e2[(t2 & 0x0F) << 2];
-    if (use_padding) {
-      *out++ = '=';
-    }
+  simdutf_really_inline simd16<T> operator^(const simd16<T> other) const {
+    return __lsx_vxor_v(this->value, other.value);
+  }
+  simdutf_really_inline simd16<T> bit_andnot(const simd16<T> other) const {
+    return __lsx_vandn_v(this->value, other.value);
+  }
+  simdutf_really_inline simd16<T> operator~() const { return __lsx_vnor_v(this->value, this->value); } // NOT x == NOR(x, x): inverts all 16 bits of every lane
+  simdutf_really_inline simd16<T> &operator|=(const simd16<T> other) {
+    auto this_cast = static_cast<simd16<T> *>(this);
+    *this_cast = *this_cast | other;
+    return *this_cast;
+  }
+  simdutf_really_inline simd16<T> &operator&=(const simd16<T> other) {
+    auto this_cast = static_cast<simd16<T> *>(this);
+    *this_cast = *this_cast & other;
+    return *this_cast;
+  }
+  simdutf_really_inline simd16<T> &operator^=(const simd16<T> other) {
+    auto this_cast = static_cast<simd16<T> *>(this);
+    *this_cast = *this_cast ^ other;
+    return *this_cast;
   }
-  return (size_t)(out - dst);
-}
 
-template <class char_type>
-simdutf_warn_unused size_t maximal_binary_length_from_base64(
-    const char_type *input, size_t length) noexcept {
-  // We follow https://infra.spec.whatwg.org/#forgiving-base64-decode
-  size_t padding = 0;
-  if (length > 0) {
-    if (input[length - 1] == '=') {
-      padding++;
-      if (length > 1 && input[length - 2] == '=') {
-        padding++;
-      }
-    }
+  friend simdutf_really_inline Mask operator==(const simd16<T> lhs,
+                                               const simd16<T> rhs) {
+    return __lsx_vseq_h(lhs.value, rhs.value);
   }
-  size_t actual_length = length - padding;
-  if (actual_length % 4 <= 1) {
-    return actual_length / 4 * 3;
+
+  template <int N = 1>
+  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+    return __lsx_vor_v(__lsx_vbsll_v(this->value, N * 2), // this chunk moved up by N 16-bit lanes
+                       __lsx_vbsrl_v(prev_chunk.value, 16 - N * 2)); // top N lanes of prev_chunk fill the low lanes
   }
-  // if we have a valid input, then the remainder must be 2 or 3 adding one or
-  // two extra bytes.
-  return actual_length / 4 * 3 + (actual_length % 4) - 1;
-}
+};
 
-simdutf_warn_unused size_t
-base64_length_from_binary(size_t length, base64_options options) noexcept {
-  // By default, we use padding if we are not using the URL variant.
-  // This is check with ((options & base64_url) == 0) which returns true if we
-  // are not using the URL variant. However, we also allow 'inversion' of the
-  // convention with the base64_reverse_padding option. If the
-  // base64_reverse_padding option is set, we use padding if we are using the
-  // URL variant, and we omit it if we are not using the URL variant. This is
-  // checked with
-  // ((options & base64_reverse_padding) == base64_reverse_padding).
-  bool use_padding =
-      ((options & base64_url) == 0) ^
-      ((options & base64_reverse_padding) == base64_reverse_padding);
-  if (!use_padding) {
-    return length / 3 * 4 + ((length % 3) ? (length % 3) + 1 : 0);
+template <typename T, typename Mask = simd16<bool>>
+struct base16 : base_u16<T> {
+  typedef uint16_t bitmask_t;
+  typedef uint32_t bitmask2_t;
+
+  simdutf_really_inline base16() : base_u16<T>() {}
+  simdutf_really_inline base16(const __m128i _value) : base_u16<T>(_value) {}
+  template <typename Pointer>
+  simdutf_really_inline base16(const Pointer *ptr)
+      : base16(__lsx_vld(ptr, 0)) {}
+
+  static const int SIZE = sizeof(base_u16<T>::value);
+
+  template <int N = 1>
+  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+    return __lsx_vor_v(__lsx_vbsll_v(this->value, N * 2), // this chunk moved up by N 16-bit lanes
+                       __lsx_vbsrl_v(prev_chunk.value, 16 - N * 2)); // top N lanes of prev_chunk fill the low lanes
   }
-  return (length + 2) / 3 *
-         4; // We use padding to make the length a multiple of 4.
-}
+};
 
-} // namespace base64
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+// SIMD 16-bit lane mask type (returned by things like eq and gt)
+template <> struct simd16<bool> : base16<bool> {
+  static simdutf_really_inline simd16<bool> splat(bool _value) {
+    return __lsx_vreplgr2vr_h(uint16_t(-(!!_value)));
+  }
 
-#endif
-/* end file src/scalar/base64.h */
-/* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
-#ifndef SIMDUTF_LATIN1_TO_UTF8_H
-#define SIMDUTF_LATIN1_TO_UTF8_H
+  simdutf_really_inline simd16() : base16() {}
+  simdutf_really_inline simd16(const __m128i _value) : base16<bool>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}
+};
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace latin1_to_utf8 {
+template <typename T> struct base16_numeric : base16<T> {
+  static simdutf_really_inline simd16<T> splat(T _value) {
+    return __lsx_vreplgr2vr_h(_value);
+  }
+  static simdutf_really_inline simd16<T> zero() { return __lsx_vldi(0); }
+  static simdutf_really_inline simd16<T> load(const T values[8]) {
+    return __lsx_vld(reinterpret_cast<const uint16_t *>(values), 0);
+  }
 
-inline size_t convert(const char *buf, size_t len, char *utf8_output) {
-  const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
-  size_t pos = 0;
-  size_t utf8_pos = 0;
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <=
-        len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 |
-                 v2}; // We are only interested in these bits: 1000 1000 1000
-                      // 1000, so it makes sense to concatenate everything
-      if ((v & 0x8080808080808080) ==
-          0) { // if NONE of these are set, e.g. all of them are zero, then
-               // everything is ASCII
-        size_t final_pos = pos + 16;
-        while (pos < final_pos) {
-          utf8_output[utf8_pos++] = char(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
+  simdutf_really_inline base16_numeric() : base16<T>() {}
+  simdutf_really_inline base16_numeric(const __m128i _value)
+      : base16<T>(_value) {}
 
-    unsigned char byte = data[pos];
-    if ((byte & 0x80) == 0) { // if ASCII
-      // will generate one UTF-8 bytes
-      utf8_output[utf8_pos++] = char(byte);
-      pos++;
-    } else {
-      // will generate two UTF-8 bytes
-      utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000);
-      utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000);
-      pos++;
-    }
+  // Store to array
+  simdutf_really_inline void store(T dst[8]) const {
+    return __lsx_vst(this->value, dst, 0);
   }
-  return utf8_pos;
-}
 
-inline size_t convert_safe(const char *buf, size_t len, char *utf8_output,
-                           size_t utf8_len) {
-  const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
-  size_t pos = 0;
-  size_t skip_pos = 0;
-  size_t utf8_pos = 0;
-  while (pos < len && utf8_pos < utf8_len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos >= skip_pos && pos + 16 <= len &&
-        utf8_pos + 16 <= utf8_len) { // if it is safe to read 16 more bytes,
-                                     // check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 |
-                 v2}; // We are only interested in these bits: 1000 1000 1000
-                      // 1000, so it makes sense to concatenate everything
-      if ((v & 0x8080808080808080) ==
-          0) { // if NONE of these are set, e.g. all of them are zero, then
-               // everything is ASCII
-        ::memcpy(utf8_output + utf8_pos, buf + pos, 16);
-        utf8_pos += 16;
-        pos += 16;
-      } else {
-        // At least one of the next 16 bytes are not ASCII, we will process them
-        // one by one
-        skip_pos = pos + 16;
-      }
-    } else {
-      const auto byte = data[pos];
-      if ((byte & 0x80) == 0) { // if ASCII
-        // will generate one UTF-8 bytes
-        utf8_output[utf8_pos++] = char(byte);
-        pos++;
-      } else if (utf8_pos + 2 <= utf8_len) {
-        // will generate two UTF-8 bytes
-        utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000);
-        utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000);
-        pos++;
-      } else {
-        break;
-      }
-    }
-  }
-  return utf8_pos;
-}
-
-} // namespace latin1_to_utf8
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
-
-#endif
-/* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
-
-namespace simdutf {
-bool implementation::supported_by_runtime_system() const {
-  uint32_t required_instruction_sets = this->required_instruction_sets();
-  uint32_t supported_instruction_sets =
-      internal::detect_supported_architectures();
-  return ((supported_instruction_sets & required_instruction_sets) ==
-          required_instruction_sets);
-}
+  // Override of the bool version for numeric lanes: bitwise NOT of all 16 bits (NOR of the value with itself)
+  simdutf_really_inline simd16<T> operator~() const { return __lsx_vnor_v(this->value, this->value); }
 
-simdutf_warn_unused encoding_type implementation::autodetect_encoding(
-    const char *input, size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if (bom_encoding != encoding_type::unspecified) {
-    return bom_encoding;
+  // Addition/subtraction are the same for signed and unsigned
+  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const {
+    return __lsx_vadd_h(this->value, other.value); // wrapping add per 16-bit lane (carry crosses the byte boundary)
   }
-  // UTF8 is common, it includes ASCII, and is commonly represented
-  // without a BOM, so if it fits, go with that. Note that it is still
-  // possible to get it wrong, we are only 'guessing'. If some has UTF-16
-  // data without a BOM, it could pass as UTF-8.
-  //
-  // An interesting twist might be to check for UTF-16 ASCII first (every
-  // other byte is zero).
-  if (validate_utf8(input, length)) {
-    return encoding_type::UTF8;
+  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const {
+    return __lsx_vsub_h(this->value, other.value); // wrapping subtract per 16-bit lane (borrow crosses the byte boundary)
   }
-  // The next most common encoding that might appear without BOM is probably
-  // UTF-16LE, so try that next.
-  if ((length % 2) == 0) {
-    // important: we need to divide by two
-    if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
-                         length / 2)) {
-      return encoding_type::UTF16_LE;
-    }
+  simdutf_really_inline simd16<T> &operator+=(const simd16<T> other) {
+    *this = *this + other;
+    return *static_cast<simd16<T> *>(this);
   }
-  if ((length % 4) == 0) {
-    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
-      return encoding_type::UTF32_LE;
-    }
+  simdutf_really_inline simd16<T> &operator-=(const simd16<T> other) {
+    *this = *this - other;
+    return *static_cast<simd16<T> *>(this);
   }
-  return encoding_type::unspecified;
-}
-
-namespace internal {
-// When there is a single implementation, we should not pay a price
-// for dispatching to the best implementation. We should just use the
-// one we have. This is a compile-time check.
-#define SIMDUTF_SINGLE_IMPLEMENTATION                                          \
-  (SIMDUTF_IMPLEMENTATION_ICELAKE + SIMDUTF_IMPLEMENTATION_HASWELL +           \
-       SIMDUTF_IMPLEMENTATION_WESTMERE + SIMDUTF_IMPLEMENTATION_ARM64 +        \
-       SIMDUTF_IMPLEMENTATION_PPC64 + SIMDUTF_IMPLEMENTATION_FALLBACK ==       \
-   1)
-
-// Static array of known implementations. We are hoping these get baked into the
-// executable without requiring a static initializer.
+};
 
-#if SIMDUTF_IMPLEMENTATION_ICELAKE
-static const icelake::implementation *get_icelake_singleton() {
-  static const icelake::implementation icelake_singleton{};
-  return &icelake_singleton;
-}
-#endif
-#if SIMDUTF_IMPLEMENTATION_HASWELL
-static const haswell::implementation *get_haswell_singleton() {
-  static const haswell::implementation haswell_singleton{};
-  return &haswell_singleton;
-}
-#endif
-#if SIMDUTF_IMPLEMENTATION_WESTMERE
-static const westmere::implementation *get_westmere_singleton() {
-  static const westmere::implementation westmere_singleton{};
-  return &westmere_singleton;
-}
-#endif
-#if SIMDUTF_IMPLEMENTATION_ARM64
-static const arm64::implementation *get_arm64_singleton() {
-  static const arm64::implementation arm64_singleton{};
-  return &arm64_singleton;
-}
-#endif
-#if SIMDUTF_IMPLEMENTATION_PPC64
-static const ppc64::implementation *get_ppc64_singleton() {
-  static const ppc64::implementation ppc64_singleton{};
-  return &ppc64_singleton;
-}
-#endif
-#if SIMDUTF_IMPLEMENTATION_RVV
-static const rvv::implementation *get_rvv_singleton() {
-  static const rvv::implementation rvv_singleton{};
-  return &rvv_singleton;
-}
-#endif
-#if SIMDUTF_IMPLEMENTATION_FALLBACK
-static const fallback::implementation *get_fallback_singleton() {
-  static const fallback::implementation fallback_singleton{};
-  return &fallback_singleton;
-}
-#endif
+// Signed code units
+template <> struct simd16<int16_t> : base16_numeric<int16_t> {
+  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
+  simdutf_really_inline simd16(const __m128i _value)
+      : base16_numeric<int16_t>(_value) {}
+  simdutf_really_inline simd16(simd16<bool> other)
+      : base16_numeric<int16_t>(other.value) {}
 
-#if SIMDUTF_SINGLE_IMPLEMENTATION
-static const implementation *get_single_implementation() {
-  return
-  #if SIMDUTF_IMPLEMENTATION_ICELAKE
-      get_icelake_singleton();
-  #endif
-  #if SIMDUTF_IMPLEMENTATION_HASWELL
-  get_haswell_singleton();
-  #endif
-  #if SIMDUTF_IMPLEMENTATION_WESTMERE
-  get_westmere_singleton();
-  #endif
-  #if SIMDUTF_IMPLEMENTATION_ARM64
-  get_arm64_singleton();
-  #endif
-  #if SIMDUTF_IMPLEMENTATION_PPC64
-  get_ppc64_singleton();
-  #endif
-  #if SIMDUTF_IMPLEMENTATION_FALLBACK
-  get_fallback_singleton();
-  #endif
-}
-#endif
+  // Splat constructor
+  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const int16_t *values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t *values)
+      : simd16(load(reinterpret_cast<const int16_t *>(values))) {}
+  simdutf_really_inline operator simd16<uint16_t>() const;
 
-/**
- * @private Detects best supported implementation on first use, and sets it
- */
-class detect_best_supported_implementation_on_first_use final
-    : public implementation {
-public:
-  std::string name() const noexcept final { return set_best()->name(); }
-  std::string description() const noexcept final {
-    return set_best()->description();
+  // Order-sensitive comparisons
+  simdutf_really_inline simd16<int16_t>
+  max_val(const simd16<int16_t> other) const {
+    return __lsx_vmax_h(this->value, other.value);
   }
-  uint32_t required_instruction_sets() const noexcept final {
-    return set_best()->required_instruction_sets();
+  simdutf_really_inline simd16<int16_t>
+  min_val(const simd16<int16_t> other) const {
+    return __lsx_vmin_h(this->value, other.value);
   }
-
-  simdutf_warn_unused int
-  detect_encodings(const char *input, size_t length) const noexcept override {
-    return set_best()->detect_encodings(input, length);
+  simdutf_really_inline simd16<bool>
+  operator>(const simd16<int16_t> other) const {
+    return __lsx_vslt_h(other.value, this->value); // strict signed compare: other < this, equal lanes are false
   }
-
-  simdutf_warn_unused bool
-  validate_utf8(const char *buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf8(buf, len);
+  simdutf_really_inline simd16<bool>
+  operator<(const simd16<int16_t> other) const {
+    return __lsx_vslt_h(this->value, other.value);
   }
+};
 
-  simdutf_warn_unused result validate_utf8_with_errors(
-      const char *buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf8_with_errors(buf, len);
-  }
+// Unsigned code units
+template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
+  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
+  simdutf_really_inline simd16(const __m128i _value)
+      : base16_numeric<uint16_t>((__m128i)_value) {}
+  simdutf_really_inline simd16(simd16<bool> other)
+      : base16_numeric<uint16_t>(other.value) {}
 
-  simdutf_warn_unused bool
-  validate_ascii(const char *buf, size_t len) const noexcept final override {
-    return set_best()->validate_ascii(buf, len);
-  }
+  // Splat constructor
+  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t *values)
+      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}
 
-  simdutf_warn_unused result validate_ascii_with_errors(
-      const char *buf, size_t len) const noexcept final override {
-    return set_best()->validate_ascii_with_errors(buf, len);
+  // Saturated math
+  simdutf_really_inline simd16<uint16_t>
+  saturating_add(const simd16<uint16_t> other) const {
+    return __lsx_vsadd_hu(this->value, other.value);
   }
-
-  simdutf_warn_unused bool
-  validate_utf16le(const char16_t *buf,
-                   size_t len) const noexcept final override {
-    return set_best()->validate_utf16le(buf, len);
+  simdutf_really_inline simd16<uint16_t>
+  saturating_sub(const simd16<uint16_t> other) const {
+    return __lsx_vssub_hu(this->value, other.value);
   }
 
-  simdutf_warn_unused bool
-  validate_utf16be(const char16_t *buf,
-                   size_t len) const noexcept final override {
-    return set_best()->validate_utf16be(buf, len);
+  // Order-specific operations
+  simdutf_really_inline simd16<uint16_t>
+  max_val(const simd16<uint16_t> other) const {
+    return __lsx_vmax_hu(this->value, other.value);
   }
-
-  simdutf_warn_unused result validate_utf16le_with_errors(
-      const char16_t *buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf16le_with_errors(buf, len);
+  simdutf_really_inline simd16<uint16_t>
+  min_val(const simd16<uint16_t> other) const {
+    return __lsx_vmin_hu(this->value, other.value);
   }
-
-  simdutf_warn_unused result validate_utf16be_with_errors(
-      const char16_t *buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf16be_with_errors(buf, len);
+  // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t>
+  gt_bits(const simd16<uint16_t> other) const {
+    return this->saturating_sub(other);
   }
-
-  simdutf_warn_unused bool
-  validate_utf32(const char32_t *buf,
-                 size_t len) const noexcept final override {
-    return set_best()->validate_utf32(buf, len);
+  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t>
+  lt_bits(const simd16<uint16_t> other) const {
+    return other.saturating_sub(*this);
   }
-
-  simdutf_warn_unused result validate_utf32_with_errors(
-      const char32_t *buf, size_t len) const noexcept final override {
-    return set_best()->validate_utf32_with_errors(buf, len);
+  simdutf_really_inline simd16<bool>
+  operator<=(const simd16<uint16_t> other) const {
+    return __lsx_vsle_hu(this->value, other.value);
   }
-
-  simdutf_warn_unused size_t
-  convert_latin1_to_utf8(const char *buf, size_t len,
-                         char *utf8_output) const noexcept final override {
-    return set_best()->convert_latin1_to_utf8(buf, len, utf8_output);
+  simdutf_really_inline simd16<bool>
+  operator>=(const simd16<uint16_t> other) const {
+    return __lsx_vsle_hu(other.value, this->value);
   }
-
-  simdutf_warn_unused size_t convert_latin1_to_utf16le(
-      const char *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output);
+  simdutf_really_inline simd16<bool>
+  operator>(const simd16<uint16_t> other) const {
+    return __lsx_vslt_hu(other.value, this->value);
   }
-
-  simdutf_warn_unused size_t convert_latin1_to_utf16be(
-      const char *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output);
+  simdutf_really_inline simd16<bool>
+  operator<(const simd16<uint16_t> other) const {
+    return __lsx_vslt_hu(this->value, other.value);
   }
 
-  simdutf_warn_unused size_t convert_latin1_to_utf32(
-      const char *buf, size_t len,
-      char32_t *latin1_output) const noexcept final override {
-    return set_best()->convert_latin1_to_utf32(buf, len, latin1_output);
+  // Bit-specific operations
+  simdutf_really_inline simd16<bool> bits_not_set() const {
+    return *this == uint16_t(0);
   }
-
-  simdutf_warn_unused size_t
-  convert_utf8_to_latin1(const char *buf, size_t len,
-                         char *latin1_output) const noexcept final override {
-    return set_best()->convert_utf8_to_latin1(buf, len, latin1_output);
+  template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
+    return simd16<uint16_t>(__lsx_vsrli_h(this->value, N));
   }
-
-  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
-      const char *buf, size_t len,
-      char *latin1_output) const noexcept final override {
-    return set_best()->convert_utf8_to_latin1_with_errors(buf, len,
-                                                          latin1_output);
+  template <int N> simdutf_really_inline simd16<uint16_t> shl() const {
+    return simd16<uint16_t>(__lsx_vslli_h(this->value, N));
   }
 
-  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
-      const char *buf, size_t len,
-      char *latin1_output) const noexcept final override {
-    return set_best()->convert_valid_utf8_to_latin1(buf, len, latin1_output);
+  // logical operations
+  simdutf_really_inline simd16<uint16_t>
+  operator|(const simd16<uint16_t> other) const {
+    return __lsx_vor_v(this->value, other.value);
   }
-
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(
-      const char *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output);
+  simdutf_really_inline simd16<uint16_t>
+  operator&(const simd16<uint16_t> other) const {
+    return __lsx_vand_v(this->value, other.value);
   }
-
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(
-      const char *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output);
+  simdutf_really_inline simd16<uint16_t>
+  operator^(const simd16<uint16_t> other) const {
+    return __lsx_vxor_v(this->value, other.value);
   }
 
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
-      const char *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf16le_with_errors(buf, len,
-                                                           utf16_output);
+  // Pack with the unsigned saturation of two uint16_t code units into single
+  // uint8_t vector
+  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
+                                                   const simd16<uint16_t> &v1) {
+    return __lsx_vssrlni_bu_h(v1.value, v0.value, 0);
   }
 
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
-      const char *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf16be_with_errors(buf, len,
-                                                           utf16_output);
+  // Change the endianness
+  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
+    return __lsx_vshuf4i_b(this->value, 0b10110001);
   }
+};
 
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
-      const char *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output);
-  }
+simdutf_really_inline simd16<int16_t>::operator simd16<uint16_t>() const {
+  return this->value;
+}
 
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
-      const char *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output);
-  }
+template <typename T> struct simd16x32 {
+  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
+  static_assert(
+      NUM_CHUNKS == 4,
+      "LOONGARCH kernel should use four registers per 64-byte block.");
+  simd16<T> chunks[NUM_CHUNKS];
 
-  simdutf_warn_unused size_t
-  convert_utf8_to_utf32(const char *buf, size_t len,
-                        char32_t *utf32_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
-  }
+  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
+  simd16x32<T> &
+  operator=(const simd16<T> other) = delete; // no assignment allowed
+  simd16x32() = delete;                      // no default constructor allowed
 
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
-      const char *buf, size_t len,
-      char32_t *utf32_output) const noexcept final override {
-    return set_best()->convert_utf8_to_utf32_with_errors(buf, len,
-                                                         utf32_output);
-  }
+  simdutf_really_inline
+  simd16x32(const simd16<T> chunk0, const simd16<T> chunk1,
+            const simd16<T> chunk2, const simd16<T> chunk3)
+      : chunks{chunk0, chunk1, chunk2, chunk3} {}
+  simdutf_really_inline simd16x32(const T *ptr)
+      : chunks{simd16<T>::load(ptr),
+               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T)),
+               simd16<T>::load(ptr + 2 * sizeof(simd16<T>) / sizeof(T)),
+               simd16<T>::load(ptr + 3 * sizeof(simd16<T>) / sizeof(T))} {}
 
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
-      const char *buf, size_t len,
-      char32_t *utf32_output) const noexcept final override {
-    return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
+  simdutf_really_inline void store(T *ptr) const {
+    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
+    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
+    this->chunks[2].store(ptr + sizeof(simd16<T>) * 2 / sizeof(T));
+    this->chunks[3].store(ptr + sizeof(simd16<T>) * 3 / sizeof(T));
   }
 
-  simdutf_warn_unused size_t
-  convert_utf16le_to_latin1(const char16_t *buf, size_t len,
-                            char *latin1_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output);
+  simdutf_really_inline simd16<T> reduce_or() const {
+    return (this->chunks[0] | this->chunks[1]) |
+           (this->chunks[2] | this->chunks[3]);
   }
 
-  simdutf_warn_unused size_t
-  convert_utf16be_to_latin1(const char16_t *buf, size_t len,
-                            char *latin1_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output);
-  }
+  simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); }
 
-  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
-      const char16_t *buf, size_t len,
-      char *latin1_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_latin1_with_errors(buf, len,
-                                                             latin1_output);
+  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
+    this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
+    this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 1);
+    this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 2);
+    this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 3);
   }
 
-  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
-      const char16_t *buf, size_t len,
-      char *latin1_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_latin1_with_errors(buf, len,
-                                                             latin1_output);
+  simdutf_really_inline uint64_t to_bitmask() const { // packs the four per-chunk masks into a single 64-bit value
+    __m128i mask = __lsx_vbsll_v(__lsx_vmsknz_b((this->chunks[3]).value), 6); // chunk 3 mask moved up by 6 bytes; vmsknz_b presumably yields one bit per non-zero byte
+    mask = __lsx_vor_v(
+        mask, __lsx_vbsll_v(__lsx_vmsknz_b((this->chunks[2]).value), 4)); // chunk 2 mask moved up by 4 bytes
+    mask = __lsx_vor_v(
+        mask, __lsx_vbsll_v(__lsx_vmsknz_b((this->chunks[1]).value), 2)); // chunk 1 mask moved up by 2 bytes
+    mask = __lsx_vor_v(mask, __lsx_vmsknz_b((this->chunks[0]).value)); // chunk 0 mask stays in the lowest position
+    return __lsx_vpickve2gr_du(mask, 0); // extract 64-bit element 0 of the combined vector
   }
 
-  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
-      const char16_t *buf, size_t len,
-      char *latin1_output) const noexcept final override {
-    return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output);
+  simdutf_really_inline void swap_bytes() { // in place: every chunk is overwritten with its own swap_bytes() result
+    this->chunks[0] = this->chunks[0].swap_bytes();
+    this->chunks[1] = this->chunks[1].swap_bytes();
+    this->chunks[2] = this->chunks[2].swap_bytes();
+    this->chunks[3] = this->chunks[3].swap_bytes();
   }
 
-  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
-      const char16_t *buf, size_t len,
-      char *latin1_output) const noexcept final override {
-    return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output);
+  simdutf_really_inline uint64_t eq(const T m) const { // bitmask of the lanes that compare equal to m
+    const simd16<T> mask = simd16<T>::splat(m); // m broadcast to every lane
+    return simd16x32<bool>(this->chunks[0] == mask, this->chunks[1] == mask,
+                           this->chunks[2] == mask, this->chunks[3] == mask)
+        .to_bitmask(); // per-chunk comparison results reduced by simd16x32<bool>::to_bitmask
   }
 
-  simdutf_warn_unused size_t
-  convert_utf16le_to_utf8(const char16_t *buf, size_t len,
-                          char *utf8_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output);
+  simdutf_really_inline uint64_t lteq(const T m) const { // bitmask of the lanes that are <= m
+    const simd16<T> mask = simd16<T>::splat(m); // m broadcast to every lane
+    return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask,
+                           this->chunks[2] <= mask, this->chunks[3] <= mask)
+        .to_bitmask(); // per-chunk comparison results reduced by simd16x32<bool>::to_bitmask
   }
 
-  simdutf_warn_unused size_t
-  convert_utf16be_to_utf8(const char16_t *buf, size_t len,
-                          char *utf8_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output);
-  }
+  simdutf_really_inline uint64_t in_range(const T low, const T high) const { // bitmask of lanes with low <= lane <= high
+    const simd16<T> mask_low = simd16<T>::splat(low); // lower bound broadcast to every lane
+    const simd16<T> mask_high = simd16<T>::splat(high); // upper bound broadcast to every lane
 
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
-      const char16_t *buf, size_t len,
-      char *utf8_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_utf8_with_errors(buf, len,
-                                                           utf8_output);
+    return simd16x32<bool>(
+               (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), // both bounds are inclusive
+               (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low),
+               (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low),
+               (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low))
+        .to_bitmask();
   }
-
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
-      const char16_t *buf, size_t len,
-      char *utf8_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_utf8_with_errors(buf, len,
-                                                           utf8_output);
+  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { // bitmask of lanes with lane < low or lane > high
+    const simd16<T> mask_low = simd16<T>::splat(low); // lower bound broadcast to every lane
+    const simd16<T> mask_high = simd16<T>::splat(high); // upper bound broadcast to every lane
+    return simd16x32<bool>(
+               (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), // strict comparisons: the bounds themselves count as in range
+               (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low),
+               (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low),
+               (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low))
+        .to_bitmask();
   }
-
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
-      const char16_t *buf, size_t len,
-      char *utf8_output) const noexcept final override {
-    return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output);
+  simdutf_really_inline uint64_t lt(const T m) const { // bitmask of the lanes that are strictly < m
+    const simd16<T> mask = simd16<T>::splat(m); // m broadcast to every lane
+    return simd16x32<bool>(this->chunks[0] < mask, this->chunks[1] < mask,
+                           this->chunks[2] < mask, this->chunks[3] < mask)
+        .to_bitmask(); // per-chunk comparison results reduced by simd16x32<bool>::to_bitmask
   }
 
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
-      const char16_t *buf, size_t len,
-      char *utf8_output) const noexcept final override {
-    return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output);
-  }
+}; // struct simd16x32<T>
 
-  simdutf_warn_unused size_t
-  convert_utf32_to_latin1(const char32_t *buf, size_t len,
-                          char *latin1_output) const noexcept final override {
-    return set_best()->convert_utf32_to_latin1(buf, len, latin1_output);
-  }
+template <> // explicit specialization of simd16x32<T>::not_in_range for T = uint16_t
+simdutf_really_inline uint64_t simd16x32<uint16_t>::not_in_range(
+    const uint16_t low, const uint16_t high) const { // bitmask of lanes with lane < low or lane > high
+  const simd16<uint16_t> mask_low = simd16<uint16_t>::splat(low); // lower bound broadcast to every lane
+  const simd16<uint16_t> mask_high = simd16<uint16_t>::splat(high); // upper bound broadcast to every lane
+  simd16x32<uint16_t> x(simd16<uint16_t>((this->chunks[0] > mask_high) | // each comparison result is converted to simd16<uint16_t>
+                                         (this->chunks[0] < mask_low)),
+                        simd16<uint16_t>((this->chunks[1] > mask_high) |
+                                         (this->chunks[1] < mask_low)),
+                        simd16<uint16_t>((this->chunks[2] > mask_high) |
+                                         (this->chunks[2] < mask_low)),
+                        simd16<uint16_t>((this->chunks[3] > mask_high) |
+                                         (this->chunks[3] < mask_low)));
+  return x.to_bitmask(); // reduced with simd16x32<uint16_t>::to_bitmask rather than the <bool> variant
+}
+/* end file src/simdutf/lsx/simd16-inl.h */
+} // namespace simd
+} // unnamed namespace
+} // namespace lsx
+} // namespace simdutf
 
-  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
-      const char32_t *buf, size_t len,
-      char *latin1_output) const noexcept final override {
-    return set_best()->convert_utf32_to_latin1_with_errors(buf, len,
-                                                           latin1_output);
-  }
+#endif // SIMDUTF_LSX_SIMD_H
+/* end file src/simdutf/lsx/simd.h */
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
-      const char32_t *buf, size_t len,
-      char *latin1_output) const noexcept final override {
-    return set_best()->convert_utf32_to_latin1(buf, len, latin1_output);
-  }
+/* begin file src/simdutf/lsx/end.h */
+/* end file src/simdutf/lsx/end.h */
 
-  simdutf_warn_unused size_t
-  convert_utf32_to_utf8(const char32_t *buf, size_t len,
-                        char *utf8_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
-  }
+#endif // SIMDUTF_IMPLEMENTATION_LSX
 
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
-      const char32_t *buf, size_t len,
-      char *utf8_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-  }
+#endif // SIMDUTF_LSX_H
+/* end file src/simdutf/lsx.h */
+/* begin file src/simdutf/lasx.h */
+#ifndef SIMDUTF_LASX_H
+#define SIMDUTF_LASX_H
 
-  simdutf_warn_unused size_t
-  convert_valid_utf32_to_utf8(const char32_t *buf, size_t len,
-                              char *utf8_output) const noexcept final override {
-    return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
-  }
+#ifdef SIMDUTF_FALLBACK_H
+  #error "lasx.h must be included before fallback.h"
+#endif
 
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(
-      const char32_t *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output);
-  }
 
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(
-      const char32_t *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output);
-  }
+#ifndef SIMDUTF_IMPLEMENTATION_LASX
+  #define SIMDUTF_IMPLEMENTATION_LASX (SIMDUTF_IS_LASX)
+#endif
+#if SIMDUTF_IMPLEMENTATION_LASX && SIMDUTF_IS_LASX
+  #define SIMDUTF_CAN_ALWAYS_RUN_LASX 1
+#else
+  #define SIMDUTF_CAN_ALWAYS_RUN_LASX 0
+#endif
 
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
-      const char32_t *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf16le_with_errors(buf, len,
-                                                            utf16_output);
-  }
+#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)
 
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
-      const char32_t *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_utf32_to_utf16be_with_errors(buf, len,
-                                                            utf16_output);
-  }
+#if SIMDUTF_IMPLEMENTATION_LASX
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
-      const char32_t *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output);
-  }
+namespace simdutf {
+/**
+ * Implementation for LoongArch ASX.
+ */
+namespace lasx {} // namespace lasx
+} // namespace simdutf
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
-      const char32_t *buf, size_t len,
-      char16_t *utf16_output) const noexcept final override {
-    return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output);
-  }
+/* begin file src/simdutf/lasx/implementation.h */
+#ifndef SIMDUTF_LASX_IMPLEMENTATION_H
+#define SIMDUTF_LASX_IMPLEMENTATION_H
 
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(
-      const char16_t *buf, size_t len,
-      char32_t *utf32_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output);
-  }
 
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(
-      const char16_t *buf, size_t len,
-      char32_t *utf32_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output);
-  }
+namespace simdutf {
+namespace lasx {
+
+namespace {
+using namespace simdutf;
+}
 
+class implementation final : public simdutf::implementation { // LASX backend class; apart from the constructor, members are only declared here
+public:
+  simdutf_really_inline implementation() // passes name "lasx", description "LOONGARCH ASX" and the LSX | LASX flags to the base class
+      : simdutf::implementation("lasx", "LOONGARCH ASX",
+                                internal::instruction_set::LSX |
+                                    internal::instruction_set::LASX) {}
+  simdutf_warn_unused int detect_encodings(const char *input, // the declarations marked `final` must override virtuals of the base class
+                                           size_t length) const noexcept final;
+  simdutf_warn_unused bool validate_utf8(const char *buf,
+                                         size_t len) const noexcept final;
+  simdutf_warn_unused result
+  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_ascii(const char *buf,
+                                          size_t len) const noexcept final;
+  simdutf_warn_unused result
+  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
+                                            size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
+                                            size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16le_with_errors(
+      const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16be_with_errors(
+      const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
+                                          size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf32_with_errors(
+      const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf8(
+      const char *buf, size_t len, char *utf8_output) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(
+      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(
+      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf32(
+      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_latin1(
+      const char *buf, size_t len, char *latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+      const char *buf, size_t len, char *latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16le(
+      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16be(
+      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf32(
+      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf16le_to_latin1(const char16_t *buf, size_t len,
+                            char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf16be_to_latin1(const char16_t *buf, size_t len,
+                            char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+      const char16_t *buf, size_t len,
+      char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+      const char16_t *buf, size_t len,
+      char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
+                                  char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
+                                  char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf8(
+      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf8(
+      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf32_to_latin1(const char32_t *buf, size_t len,
+                          char *latin1_output) const noexcept final;
+  simdutf_warn_unused result
+  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+                                      char *latin1_output) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
+                                char *latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf8(
+      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf32_to_utf16le(const char32_t *buf, size_t len,
+                           char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf32_to_utf16be(const char32_t *buf, size_t len,
+                           char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+      const char32_t *buf, size_t len,
+      char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+      const char32_t *buf, size_t len,
+      char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
+                                 char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
+                                 char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf16le_to_utf32(const char16_t *buf, size_t len,
+                           char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf16be_to_utf32(const char16_t *buf, size_t len,
+                           char32_t *utf32_buffer) const noexcept final;
   simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
       const char16_t *buf, size_t len,
-      char32_t *utf32_output) const noexcept final override {
-    return set_best()->convert_utf16le_to_utf32_with_errors(buf, len,
-                                                            utf32_output);
-  }
-
+      char32_t *utf32_buffer) const noexcept final;
   simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
       const char16_t *buf, size_t len,
-      char32_t *utf32_output) const noexcept final override {
-    return set_best()->convert_utf16be_to_utf32_with_errors(buf, len,
-                                                            utf32_output);
-  }
+      char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
+                                 char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
+                                 char32_t *utf32_buffer) const noexcept final;
+  void change_endianness_utf16(const char16_t *buf, size_t length,
+                               char16_t *output) const noexcept final;
+  simdutf_warn_unused size_t count_utf16le(const char16_t *buf, // from here on the declarations are noexcept but carry no final/override specifier
+                                           size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
+                                           size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf8(const char *buf,
+                                        size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16le(
+      const char16_t *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16be(
+      const char16_t *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf16_length_from_utf8(const char *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf32_length_from_utf8(const char *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  latin1_length_from_utf8(const char *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  latin1_length_from_utf16(size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  latin1_length_from_utf32(size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf32_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf16_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf8_length_from_latin1(const char *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(
+      const char *input, size_t length) const noexcept;
+  simdutf_warn_unused result // NOTE(review): this 4-parameter overload overlaps with the 5-parameter one below whose last parameter is defaulted;
+  base64_to_binary(const char *input, size_t length, char *output, // a 4-argument call made directly on this type would be ambiguous - confirm it is still needed
+                   base64_options options) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(
+      const char16_t *input, size_t length) const noexcept;
+  simdutf_warn_unused result // NOTE(review): same overlap as above, for the char16_t input overload
+  base64_to_binary(const char16_t *input, size_t length, char *output,
+                   base64_options options) const noexcept;
+  simdutf_warn_unused size_t base64_length_from_binary(
+      size_t length, base64_options options) const noexcept;
+  size_t binary_to_base64(const char *input, size_t length, char *output,
+                          base64_options options) const noexcept;
 
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
-      const char16_t *buf, size_t len,
-      char32_t *utf32_output) const noexcept final override {
-    return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output);
-  }
+  simdutf_warn_unused virtual result // the four base64 declarations below are spelled `virtual`, without override/final
+  base64_to_binary(const char *input, size_t length, char *output,
+                   base64_options options,
+                   last_chunk_handling_options last_chunk_options = // last_chunk_options defaults to loose
+                       last_chunk_handling_options::loose) const noexcept;
+  simdutf_warn_unused virtual full_result base64_to_binary_details(
+      const char *input, size_t length, char *output, base64_options options,
+      last_chunk_handling_options last_chunk_options =
+          last_chunk_handling_options::loose) const noexcept;
+  simdutf_warn_unused virtual result
+  base64_to_binary(const char16_t *input, size_t length, char *output,
+                   base64_options options,
+                   last_chunk_handling_options last_chunk_options =
+                       last_chunk_handling_options::loose) const noexcept;
+  simdutf_warn_unused virtual full_result base64_to_binary_details(
+      const char16_t *input, size_t length, char *output,
+      base64_options options,
+      last_chunk_handling_options last_chunk_options =
+          last_chunk_handling_options::loose) const noexcept;
+};
 
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
-      const char16_t *buf, size_t len,
-      char32_t *utf32_output) const noexcept final override {
-    return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output);
-  }
+} // namespace lasx
+} // namespace simdutf
 
-  void change_endianness_utf16(const char16_t *buf, size_t len,
-                               char16_t *output) const noexcept final override {
-    set_best()->change_endianness_utf16(buf, len, output);
-  }
+#endif // SIMDUTF_LASX_IMPLEMENTATION_H
+/* end file src/simdutf/lasx/implementation.h */
 
-  simdutf_warn_unused size_t
-  count_utf16le(const char16_t *buf, size_t len) const noexcept final override {
-    return set_best()->count_utf16le(buf, len);
-  }
+/* begin file src/simdutf/lasx/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "lasx"
+// #define SIMDUTF_IMPLEMENTATION lasx
+/* end file src/simdutf/lasx/begin.h */
 
-  simdutf_warn_unused size_t
-  count_utf16be(const char16_t *buf, size_t len) const noexcept final override {
-    return set_best()->count_utf16be(buf, len);
-  }
+  // Declarations
+/* begin file src/simdutf/lasx/intrinsics.h */
+#ifndef SIMDUTF_LASX_INTRINSICS_H
+#define SIMDUTF_LASX_INTRINSICS_H
 
-  simdutf_warn_unused size_t
-  count_utf8(const char *buf, size_t len) const noexcept final override {
-    return set_best()->count_utf8(buf, len);
-  }
 
-  simdutf_warn_unused size_t
-  latin1_length_from_utf8(const char *buf, size_t len) const noexcept override {
-    return set_best()->latin1_length_from_utf8(buf, len);
-  }
+// This should be the correct header whether
+// you use visual studio or other compilers.
+#include <lsxintrin.h>
+#include <lasxintrin.h>
+
+#if defined(__loongarch_asx)
+  #ifdef __clang__
+    #define VREGS_PREFIX "$vr"
+    #define XREGS_PREFIX "$xr"
+  #else // GCC
+    #define VREGS_PREFIX "$f"
+    #define XREGS_PREFIX "$f"
+  #endif
+  #define __ALL_REGS                                                           \
+    "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,"  \
+    "27,28,29,30,31"
+// Convert __m128i to __m256i: widens `in` into a 256-bit value with one xvpermi.q (selector 0x0) emitted from inline asm.
+static inline __m256i ____m256i(__m128i in) { // NOTE(review): contents of the upper 128 bits depend on xvpermi.q selector semantics - confirm
+  __m256i out = __lasx_xvldi(0); // initialised up front because out is a read-write ("+f") asm operand
+  __asm__ volatile(".irp i," __ALL_REGS "\n\t" // .irp/.ifc: compare each operand's printed register name with numbers 0..31
+                   " .ifc %[out], " XREGS_PREFIX "\\i    \n\t"
+                   "  .irp j," __ALL_REGS "\n\t"
+                   "   .ifc %[in], " VREGS_PREFIX "\\j  \n\t"
+                   "    xvpermi.q $xr\\i, $xr\\j, 0x0  \n\t" // emitted once, for the (i, j) pair matching out and in
+                   "   .endif                           \n\t"
+                   "  .endr                             \n\t"
+                   " .endif                             \n\t"
+                   ".endr                               \n\t"
+                   : [out] "+f"(out)
+                   : [in] "f"(in));
+  return out;
+}
+// Combine two __m128i into one __m256i; going by the parameter names, inhi supplies the upper half and inlo the lower (confirm).
+static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) {
+  __m256i out; // write-only ("=f") asm operand, assigned in step 2 below
+  __asm__ volatile(".irp i," __ALL_REGS "\n\t" // step 1: find the register numbers of hi (i) and lo (j) by name matching
+                   " .ifc %[hi], " VREGS_PREFIX "\\i    \n\t"
+                   "  .irp j," __ALL_REGS "\n\t"
+                   "   .ifc %[lo], " VREGS_PREFIX "\\j  \n\t"
+                   "    xvpermi.q $xr\\i, $xr\\j, 0x20  \n\t" // merges lo into the hi register, which is why hi is a "+f" operand
+                   "   .endif                           \n\t"
+                   "  .endr                             \n\t"
+                   " .endif                             \n\t"
+                   ".endr                               \n\t"
+                   ".ifnc %[out], %[hi]                 \n\t" // step 2: skipped when out and hi print as the same register name
+                   ".irp i," __ALL_REGS "\n\t"
+                   " .ifc %[out], " XREGS_PREFIX "\\i   \n\t"
+                   "  .irp j," __ALL_REGS "\n\t"
+                   "   .ifc %[hi], " VREGS_PREFIX "\\j  \n\t"
+                   "    xvori.b $xr\\i, $xr\\j, 0       \n\t" // OR with immediate 0: copies the merged register into out
+                   "   .endif                           \n\t"
+                   "  .endr                             \n\t"
+                   " .endif                             \n\t"
+                   ".endr                               \n\t"
+                   ".endif                              \n\t"
+                   : [out] "=f"(out), [hi] "+f"(inhi)
+                   : [lo] "f"(inlo));
+  return out;
+}
+// Convert __m256i low part to __m128i: a 128-bit register copy (vori.b with immediate 0) emitted from inline asm.
+static inline __m128i lasx_extracti128_lo(__m256i in) {
+  __m128i out; // write-only ("=f") asm operand
+  __asm__ volatile(".ifnc %[out], %[in]                 \n\t" // nothing is emitted when out and in print as the same register name
+                   ".irp i," __ALL_REGS "\n\t" // .irp/.ifc: compare each operand's printed register name with numbers 0..31
+                   " .ifc %[out], " VREGS_PREFIX "\\i   \n\t"
+                   "  .irp j," __ALL_REGS "\n\t"
+                   "   .ifc %[in], " XREGS_PREFIX "\\j  \n\t"
+                   "    vori.b $vr\\i, $vr\\j, 0        \n\t" // OR with immediate 0, i.e. a plain copy of the 128-bit view
+                   "   .endif                           \n\t"
+                   "  .endr                             \n\t"
+                   " .endif                             \n\t"
+                   ".endr                               \n\t"
+                   ".endif                              \n\t"
+                   : [out] "=f"(out)
+                   : [in] "f"(in));
+  return out;
+}
+// Convert __m256i high part to __m128i: one xvpermi.q with selector 0x11 emitted from inline asm (selector meaning: confirm in the LASX manual).
+static inline __m128i lasx_extracti128_hi(__m256i in) {
+  __m128i out; // write-only ("=f") asm operand
+  __asm__ volatile(".irp i," __ALL_REGS "\n\t" // .irp/.ifc: compare each operand's printed register name with numbers 0..31
+                   " .ifc %[out], " VREGS_PREFIX "\\i   \n\t"
+                   "  .irp j," __ALL_REGS "\n\t"
+                   "   .ifc %[in], " XREGS_PREFIX "\\j  \n\t"
+                   "    xvpermi.q $xr\\i, $xr\\j, 0x11  \n\t" // emitted once, for the (i, j) pair matching out and in
+                   "   .endif                           \n\t"
+                   "  .endr                             \n\t"
+                   " .endif                             \n\t"
+                   ".endr                               \n\t"
+                   : [out] "=f"(out)
+                   : [in] "f"(in));
+  return out;
+}
+#endif
 
-  simdutf_warn_unused size_t
-  latin1_length_from_utf16(size_t len) const noexcept override {
-    return set_best()->latin1_length_from_utf16(len);
-  }
+#endif //  SIMDUTF_LASX_INTRINSICS_H
+/* end file src/simdutf/lasx/intrinsics.h */
+/* begin file src/simdutf/lasx/bitmanipulation.h */
+#ifndef SIMDUTF_LASX_BITMANIPULATION_H
+#define SIMDUTF_LASX_BITMANIPULATION_H
 
-  simdutf_warn_unused size_t
-  latin1_length_from_utf32(size_t len) const noexcept override {
-    return set_best()->latin1_length_from_utf32(len);
-  }
+#include <limits>
 
-  simdutf_warn_unused size_t
-  utf8_length_from_latin1(const char *buf, size_t len) const noexcept override {
-    return set_best()->utf8_length_from_latin1(buf, len);
-  }
+namespace simdutf {
+namespace lasx {
+namespace {
 
-  simdutf_warn_unused size_t utf8_length_from_utf16le(
-      const char16_t *buf, size_t len) const noexcept override {
-    return set_best()->utf8_length_from_utf16le(buf, len);
-  }
+simdutf_really_inline int count_ones(uint64_t input_num) { // number of set bits in input_num, computed with LSX intrinsics
+  return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__lsx_vreplgr2vr_d(input_num)), 0); // replicate -> per-64-bit popcount -> read 32-bit element 0
+}
 
-  simdutf_warn_unused size_t utf8_length_from_utf16be(
-      const char16_t *buf, size_t len) const noexcept override {
-    return set_best()->utf8_length_from_utf16be(buf, len);
-  }
+#if SIMDUTF_NEED_TRAILING_ZEROES
+simdutf_really_inline int trailing_zeroes(uint64_t input_num) { // count of zero bits below the lowest set bit of input_num
+  return __builtin_ctzll(input_num); // the builtin's result is undefined for input_num == 0; callers must pass a non-zero value
+}
+#endif
 
-  simdutf_warn_unused size_t
-  utf16_length_from_latin1(size_t len) const noexcept override {
-    return set_best()->utf16_length_from_latin1(len);
-  }
+} // unnamed namespace
+} // namespace lasx
+} // namespace simdutf
 
-  simdutf_warn_unused size_t
-  utf32_length_from_latin1(size_t len) const noexcept override {
-    return set_best()->utf32_length_from_latin1(len);
-  }
+#endif // SIMDUTF_LASX_BITMANIPULATION_H
+/* end file src/simdutf/lasx/bitmanipulation.h */
+/* begin file src/simdutf/lasx/simd.h */
+#ifndef SIMDUTF_LASX_SIMD_H
+#define SIMDUTF_LASX_SIMD_H
 
-  simdutf_warn_unused size_t utf32_length_from_utf16le(
-      const char16_t *buf, size_t len) const noexcept override {
-    return set_best()->utf32_length_from_utf16le(buf, len);
-  }
+#include <type_traits>
 
-  simdutf_warn_unused size_t utf32_length_from_utf16be(
-      const char16_t *buf, size_t len) const noexcept override {
-    return set_best()->utf32_length_from_utf16be(buf, len);
-  }
+namespace simdutf {
+namespace lasx {
+namespace {
+namespace simd {
 
-  simdutf_warn_unused size_t
-  utf16_length_from_utf8(const char *buf, size_t len) const noexcept override {
-    return set_best()->utf16_length_from_utf8(buf, len);
-  }
+__attribute__((aligned(32))) static const uint8_t prev_shuf_table[32][32] = {
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+    {0,  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+     31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14},
+    {0,  0,  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+     30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13},
+    {0,  0,  0,  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+     29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
+    {0,  0,  0,  0,  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+     28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+    {0,  0,  0,  0,  0,  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+     27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+    {0,  0,  0,  0,  0,  0,  0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+     26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+    {0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3, 4, 5, 6, 7, 8,
+     25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8},
+    {0,  0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3, 4, 5, 6, 7,
+     24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7},
+    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3, 4, 5, 6,
+     23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6},
+    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3, 4, 5,
+     22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5},
+    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3, 4,
+     21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4},
+    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3,
+     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3},
+    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 1, 2,
+     19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2},
+    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 1,
+     18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1},
+    {0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+     17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0},
+    {15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+     15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
+    {14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+     14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
+    {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+     13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
+    {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
+     12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
+    {11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
+     11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
+    {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+     10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
+    {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+     9, 10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0,  0},
+    {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+     8, 9, 10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0,  0},
+    {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+     7, 8, 9, 10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0,  0},
+    {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+     6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0,  0},
+    {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+     5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0,  0,  0,  0,  0},
+    {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+     4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0,  0,  0,  0},
+    {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+     3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0,  0,  0},
+    {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+     2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0,  0},
+    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0},
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+};
 
-  simdutf_warn_unused size_t utf8_length_from_utf32(
-      const char32_t *buf, size_t len) const noexcept override {
-    return set_best()->utf8_length_from_utf32(buf, len);
-  }
+__attribute__((aligned(32))) static const uint8_t bitsel_mask_table[32][32] = {
+    {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+    {0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+    {0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+    {0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x0},
+    {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0}};
 
-  simdutf_warn_unused size_t utf16_length_from_utf32(
-      const char32_t *buf, size_t len) const noexcept override {
-    return set_best()->utf16_length_from_utf32(buf, len);
-  }
+// CRTP base for the SIMD vector types: Child is the derived type it returns.
+template <typename Child> struct base {
+  __m256i value;
 
-  simdutf_warn_unused size_t
-  utf32_length_from_utf8(const char *buf, size_t len) const noexcept override {
-    return set_best()->utf32_length_from_utf8(buf, len);
-  }
+  // Zero constructor
+  simdutf_really_inline base() : value{__m256i()} {}
 
-  simdutf_warn_unused size_t maximal_binary_length_from_base64(
-      const char *input, size_t length) const noexcept override {
-    return set_best()->maximal_binary_length_from_base64(input, length);
+  // Conversion from SIMD register
+  simdutf_really_inline base(const __m256i _value) : value(_value) {}
+  // Conversion to SIMD register
+  simdutf_really_inline operator const __m256i &() const { return this->value; }
+  simdutf_really_inline operator __m256i &() { return this->value; }
+  template <endianness big_endian> // selects the byte order of the output
+  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { // writes 64 bytes at ptr
+    if (big_endian) { // NOTE(review): assumes the big-endian enumerator is the nonzero one - confirm
+      __m256i zero = __lasx_xvldi(0);
+      __m256i in8 = __lasx_xvpermi_d(this->value, 0b11011000); // 64-bit words in order 0, 2, 1, 3
+      __m256i inlow = __lasx_xvilvl_b(in8, zero); // interleave bytes of in8 with zero bytes
+      __m256i inhigh = __lasx_xvilvh_b(in8, zero);
+      __lasx_xvst(inlow, reinterpret_cast<uint16_t *>(ptr), 0); // output bytes 0..31
+      __lasx_xvst(inhigh, reinterpret_cast<uint16_t *>(ptr), 32); // output bytes 32..63
+    } else {
+      __m256i inlow = __lasx_vext2xv_hu_bu(this->value); // widen bytes to 16 bits
+      __m256i inhigh = __lasx_vext2xv_hu_bu(
+          __lasx_xvpermi_q(this->value, this->value, 0b00000001)); // 128-bit halves swapped
+      __lasx_xvst(inlow, reinterpret_cast<__m256i *>(ptr), 0); // output bytes 0..31
+      __lasx_xvst(inhigh, reinterpret_cast<__m256i *>(ptr), 32); // output bytes 32..63
+    }
   }
+  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
+    __m256i in32_0 = __lasx_vext2xv_wu_bu(this->value);
+    __lasx_xvst(in32_0, reinterpret_cast<uint32_t *>(ptr), 0);
 
-  simdutf_warn_unused result base64_to_binary(
-      const char *input, size_t length, char *output, base64_options options,
-      last_chunk_handling_options last_chunk_handling_options =
-          last_chunk_handling_options::loose) const noexcept override {
-    return set_best()->base64_to_binary(input, length, output, options,
-                                        last_chunk_handling_options);
-  }
+    __m256i in8_1 = __lasx_xvpermi_d(this->value, 0b00000001);
+    __m256i in32_1 = __lasx_vext2xv_wu_bu(in8_1);
+    __lasx_xvst(in32_1, reinterpret_cast<uint32_t *>(ptr), 32);
 
-  simdutf_warn_unused full_result base64_to_binary_details(
-      const char *input, size_t length, char *output, base64_options options,
-      last_chunk_handling_options last_chunk_handling_options =
-          last_chunk_handling_options::loose) const noexcept override {
-    return set_best()->base64_to_binary_details(input, length, output, options,
-                                                last_chunk_handling_options);
-  }
+    __m256i in8_2 = __lasx_xvpermi_d(this->value, 0b00000010);
+    __m256i in32_2 = __lasx_vext2xv_wu_bu(in8_2);
+    __lasx_xvst(in32_2, reinterpret_cast<uint32_t *>(ptr), 64);
 
-  simdutf_warn_unused size_t maximal_binary_length_from_base64(
-      const char16_t *input, size_t length) const noexcept override {
-    return set_best()->maximal_binary_length_from_base64(input, length);
+    __m256i in8_3 = __lasx_xvpermi_d(this->value, 0b00000011);
+    __m256i in32_3 = __lasx_vext2xv_wu_bu(in8_3);
+    __lasx_xvst(in32_3, reinterpret_cast<uint32_t *>(ptr), 96);
   }
-
-  simdutf_warn_unused result base64_to_binary(
-      const char16_t *input, size_t length, char *output,
-      base64_options options,
-      last_chunk_handling_options last_chunk_handling_options =
-          last_chunk_handling_options::loose) const noexcept override {
-    return set_best()->base64_to_binary(input, length, output, options,
-                                        last_chunk_handling_options);
+  // Bit operations
+  simdutf_really_inline Child operator|(const Child other) const {
+    return __lasx_xvor_v(this->value, other);
   }
-
-  simdutf_warn_unused full_result base64_to_binary_details(
-      const char16_t *input, size_t length, char *output,
-      base64_options options,
-      last_chunk_handling_options last_chunk_handling_options =
-          last_chunk_handling_options::loose) const noexcept override {
-    return set_best()->base64_to_binary_details(input, length, output, options,
-                                                last_chunk_handling_options);
+  simdutf_really_inline Child operator&(const Child other) const {
+    return __lasx_xvand_v(this->value, other);
   }
-
-  simdutf_warn_unused size_t base64_length_from_binary(
-      size_t length, base64_options options) const noexcept override {
-    return set_best()->base64_length_from_binary(length, options);
+  simdutf_really_inline Child operator^(const Child other) const {
+    return __lasx_xvxor_v(this->value, other);
   }
-
-  size_t binary_to_base64(const char *input, size_t length, char *output,
-                          base64_options options) const noexcept override {
-    return set_best()->binary_to_base64(input, length, output, options);
+  simdutf_really_inline Child bit_andnot(const Child other) const { // this & ~other
+    return __lasx_xvandn_v(other, this->value); // xvandn_v(a, b) is (~a) & b
+  }
+  // In-place OR on the derived object; returns it so calls can be chained.
+  simdutf_really_inline Child &operator|=(const Child other) {
+    Child &self = *static_cast<Child *>(this);
+    return self = self | other;
+  }
+  // In-place AND on the derived object; returns it so calls can be chained.
+  simdutf_really_inline Child &operator&=(const Child other) {
+    Child &self = *static_cast<Child *>(this);
+    return self = self & other;
+  }
+  // In-place XOR on the derived object; returns it so calls can be chained.
+  simdutf_really_inline Child &operator^=(const Child other) {
+    Child &self = *static_cast<Child *>(this);
+    return self = self ^ other;
   }
+};
 
-  simdutf_really_inline
-  detect_best_supported_implementation_on_first_use() noexcept
-      : implementation("best_supported_detector",
-                       "Detects the best supported implementation and sets it",
-                       0) {}
+template <typename T> struct simd8;
 
-private:
-  const implementation *set_best() const noexcept;
-};
+template <typename T, typename Mask = simd8<bool>>
+struct base8 : base<simd8<T>> {
+  typedef uint32_t bitmask_t;
+  typedef uint64_t bitmask2_t;
 
-static_assert(std::is_trivially_destructible<
-                  detect_best_supported_implementation_on_first_use>::value,
-              "detect_best_supported_implementation_on_first_use should be "
-              "trivially destructible");
+  simdutf_really_inline base8() : base<simd8<T>>() {}
+  simdutf_really_inline base8(const __m256i _value) : base<simd8<T>>(_value) {}
+  simdutf_really_inline T first() const { // element 0 of the vector
+    return __lasx_xvpickve2gr_wu(this->value, 0); // 32-bit word 0, narrowed to T
+  }
+  simdutf_really_inline T last() const { // byte 31, the last element
+    return static_cast<T>(__lasx_xvpickve2gr_wu(this->value, 7) >> 24); // top byte of word 7
+  }
+  friend simdutf_really_inline Mask operator==(const simd8<T> lhs,
+                                               const simd8<T> rhs) {
+    return __lasx_xvseq_b(lhs, rhs);
+  }
 
-static const std::initializer_list<const implementation *> &
-get_available_implementation_pointers() {
-  static const std::initializer_list<const implementation *>
-      available_implementation_pointers{
-#if SIMDUTF_IMPLEMENTATION_ICELAKE
-          get_icelake_singleton(),
-#endif
-#if SIMDUTF_IMPLEMENTATION_HASWELL
-          get_haswell_singleton(),
-#endif
-#if SIMDUTF_IMPLEMENTATION_WESTMERE
-          get_westmere_singleton(),
-#endif
-#if SIMDUTF_IMPLEMENTATION_ARM64
-          get_arm64_singleton(),
-#endif
-#if SIMDUTF_IMPLEMENTATION_PPC64
-          get_ppc64_singleton(),
-#endif
-#if SIMDUTF_IMPLEMENTATION_RVV
-          get_rvv_singleton(),
-#endif
-#if SIMDUTF_IMPLEMENTATION_FALLBACK
-          get_fallback_singleton(),
-#endif
-      }; // available_implementation_pointers
-  return available_implementation_pointers;
-}
+  static const int SIZE = sizeof(base<T>::value);
 
-// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no
-// support
-class unsupported_implementation final : public implementation {
-public:
-  simdutf_warn_unused int detect_encodings(const char *,
-                                           size_t) const noexcept override {
-    return encoding_type::unspecified;
+  // Returns the last N bytes of prev_chunk followed by the first 32 - N bytes
+  // of this chunk. Only 0 <= N <= 16 is implemented.
+  template <int N = 1>
+  simdutf_really_inline simd8<T> prev(const simd8<T> prev_chunk) const {
+    // Other values of N used to reach the end of this function without a
+    // return statement (undefined behavior); reject them at compile time.
+    static_assert(N >= 0 && N <= 16, "prev<N> requires 0 <= N <= 16");
+    if (!N)
+      return this->value;
+
+    if (N == 16) {
+      // Low lane: high lane of prev_chunk. High lane: low lane of this chunk.
+      return __lasx_xvpermi_q(this->value, prev_chunk.value, 0b00100001);
+    }
+
+    // 0 < N < 16. prev_shuf_table[N] shifts this chunk up by N bytes, pulling
+    // bytes across the 128-bit lane boundary from the half-swapped copy; the
+    // first N result bytes are placeholders that are overwritten below.
+    __m256i shuf = __lasx_xvld(prev_shuf_table[N], 0);
+    __m256i result = __lasx_xvshuf_b(
+        __lasx_xvpermi_q(this->value, this->value, 0b00000001), this->value,
+        shuf);
+
+    // Bring the last N bytes of prev_chunk down to the start of the low lane.
+    __m256i zero = __lasx_xvldi(0);
+    __m256i srl_prev = __lasx_xvbsrl_v(
+        __lasx_xvpermi_q(zero, prev_chunk.value, 0b00110001), (16 - N));
+
+    // bitsel_mask_table[N] is 0xFF in its first N bytes: take those bytes
+    // from srl_prev and everything else from the shuffled chunk.
+    __m256i mask = __lasx_xvld(bitsel_mask_table[N], 0);
+    return __lasx_xvbitsel_v(result, srl_prev, mask);
   }
+};
 
-  simdutf_warn_unused bool validate_utf8(const char *,
-                                         size_t) const noexcept final override {
-    return false; // Just refuse to validate. Given that we have a fallback
-                  // implementation
-    // it seems unlikely that unsupported_implementation will ever be used. If
-    // it is used, then it will flag all strings as invalid. The alternative is
-    // to return an error_code from which the user has to figure out whether the
-    // string is valid UTF-8... which seems like a lot of work just to handle
-    // the very unlikely case that we have an unsupported implementation. And,
-    // when it does happen (that we have an unsupported implementation), what
-    // are the chances that the programmer has a fallback? Given that *we*
-    // provide the fallback, it implies that the programmer would need a
-    // fallback for our fallback.
+// SIMD byte mask type (returned by things like eq and gt)
+template <> struct simd8<bool> : base8<bool> {
+  static simdutf_really_inline simd8<bool> splat(bool _value) {
+    return __lasx_xvreplgr2vr_b(uint8_t(-(!!_value)));
   }
 
-  simdutf_warn_unused result validate_utf8_with_errors(
-      const char *, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+  simdutf_really_inline simd8() : base8() {}
+  simdutf_really_inline simd8(const __m256i _value) : base8<bool>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd8(bool _value) : base8<bool>(splat(_value)) {}
 
-  simdutf_warn_unused bool
-  validate_ascii(const char *, size_t) const noexcept final override {
-    return false;
+  simdutf_really_inline uint32_t to_bitmask() const { // one result bit per byte
+    __m256i mask = __lasx_xvmsknz_b(this->value); // per-lane mask of nonzero bytes
+    uint32_t mask0 = __lasx_xvpickve2gr_wu(mask, 0); // word 0: low 128-bit lane
+    uint32_t mask1 = __lasx_xvpickve2gr_wu(mask, 4); // word 4: high 128-bit lane
+    return (mask0 | (mask1 << 16)); // assumes each lane mask fits in 16 bits
   }
-
-  simdutf_warn_unused result validate_ascii_with_errors(
-      const char *, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
+  simdutf_really_inline bool any() const { // true when at least one byte is set
+    if (__lasx_xbz_v(this->value)) // xbz_v: the whole vector is zero
+      return false;
+    return true;
   }
-
-  simdutf_warn_unused bool
-  validate_utf16le(const char16_t *, size_t) const noexcept final override {
+  simdutf_really_inline bool none() const { // true when no byte is set
+    if (__lasx_xbz_v(this->value)) // xbz_v: the whole vector is zero
+      return true;
     return false;
   }
-
-  simdutf_warn_unused bool
-  validate_utf16be(const char16_t *, size_t) const noexcept final override {
+  simdutf_really_inline bool all() const { // true when every byte is nonzero
+    if (__lasx_xbnz_b(this->value)) // xbnz_b: all byte elements are nonzero
+      return true;
     return false;
   }
+  simdutf_really_inline simd8<bool> operator~() const { return *this ^ true; }
+};
 
-  simdutf_warn_unused result validate_utf16le_with_errors(
-      const char16_t *, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
-
-  simdutf_warn_unused result validate_utf16be_with_errors(
-      const char16_t *, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
+template <typename T> struct base8_numeric : base8<T> {
+  static simdutf_really_inline simd8<T> splat(T _value) {
+    return __lasx_xvreplgr2vr_b(_value);
   }
-
-  simdutf_warn_unused bool
-  validate_utf32(const char32_t *, size_t) const noexcept final override {
-    return false;
+  static simdutf_really_inline simd8<T> zero() { return __lasx_xvldi(0); }
+  static simdutf_really_inline simd8<T> load(const T values[32]) {
+    return __lasx_xvld(reinterpret_cast<const __m256i *>(values), 0);
   }
-
-  simdutf_warn_unused result validate_utf32_with_errors(
-      const char32_t *, size_t) const noexcept final override {
-    return result(error_code::OTHER, 0);
+  // Repeat 16 values as many times as necessary (usually for lookup tables)
+  static simdutf_really_inline simd8<T> repeat_16(T v0, T v1, T v2, T v3, T v4,
+                                                  T v5, T v6, T v7, T v8, T v9,
+                                                  T v10, T v11, T v12, T v13,
+                                                  T v14, T v15) {
+    return simd8<T>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13,
+                    v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11,
+                    v12, v13, v14, v15);
   }
 
-  simdutf_warn_unused size_t convert_latin1_to_utf8(
-      const char *, size_t, char *) const noexcept final override {
-    return 0;
-  }
+  simdutf_really_inline base8_numeric() : base8<T>() {}
+  simdutf_really_inline base8_numeric(const __m256i _value)
+      : base8<T>(_value) {}
 
-  simdutf_warn_unused size_t convert_latin1_to_utf16le(
-      const char *, size_t, char16_t *) const noexcept final override {
-    return 0;
+  // Store to array
+  simdutf_really_inline void store(T dst[32]) const {
+    return __lasx_xvst(this->value, reinterpret_cast<__m256i *>(dst), 0);
   }
 
-  simdutf_warn_unused size_t convert_latin1_to_utf16be(
-      const char *, size_t, char16_t *) const noexcept final override {
-    return 0;
+  // Addition/subtraction are the same for signed and unsigned
+  simdutf_really_inline simd8<T> operator+(const simd8<T> other) const {
+    return __lasx_xvadd_b(this->value, other);
   }
-
-  simdutf_warn_unused size_t convert_latin1_to_utf32(
-      const char *, size_t, char32_t *) const noexcept final override {
-    return 0;
+  simdutf_really_inline simd8<T> operator-(const simd8<T> other) const {
+    return __lasx_xvsub_b(this->value, other);
   }
-
-  simdutf_warn_unused size_t convert_utf8_to_latin1(
-      const char *, size_t, char *) const noexcept final override {
-    return 0;
+  simdutf_really_inline simd8<T> &operator+=(const simd8<T> other) {
+    *this = *this + other;
+    return *static_cast<simd8<T> *>(this);
   }
-
-  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
-      const char *, size_t, char *) const noexcept final override {
-    return result(error_code::OTHER, 0);
+  simdutf_really_inline simd8<T> &operator-=(const simd8<T> other) {
+    *this = *this - other;
+    return *static_cast<simd8<T> *>(this);
   }
 
-  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
-      const char *, size_t, char *) const noexcept final override {
-    return 0;
-  }
+  // Override to distinguish from bool version
+  simdutf_really_inline simd8<T> operator~() const { return *this ^ 0xFFu; }
 
-  simdutf_warn_unused size_t convert_utf8_to_utf16le(
-      const char *, size_t, char16_t *) const noexcept final override {
-    return 0;
+  // Use each byte as an index into lookup_table. Indices are expected to be in
+  // 0..15 (a 16-entry table); only the low 5 bits of each byte are kept.
+  template <typename L>
+  simdutf_really_inline simd8<L> lookup_16(simd8<L> lookup_table) const {
+    __m256i origin = __lasx_xvand_v(this->value, __lasx_xvldi(0x1f)); // mask to 0..31
+    return __lasx_xvshuf_b(__lasx_xvldi(0), lookup_table, origin); // NOTE(review): 16..31 presumably pick the zero vector - confirm
   }
 
-  simdutf_warn_unused size_t convert_utf8_to_utf16be(
-      const char *, size_t, char16_t *) const noexcept final override {
-    return 0;
+  template <typename L>
+  simdutf_really_inline simd8<L>
+  lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4,
+            L replace5, L replace6, L replace7, L replace8, L replace9,
+            L replace10, L replace11, L replace12, L replace13, L replace14,
+            L replace15) const {
+    return lookup_16(simd8<L>::repeat_16(
+        replace0, replace1, replace2, replace3, replace4, replace5, replace6,
+        replace7, replace8, replace9, replace10, replace11, replace12,
+        replace13, replace14, replace15));
   }
+};
 
-  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
-      const char *, size_t, char16_t *) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+// Signed bytes
+template <> struct simd8<int8_t> : base8_numeric<int8_t> {
+  simdutf_really_inline simd8() : base8_numeric<int8_t>() {}
+  simdutf_really_inline simd8(const __m256i _value)
+      : base8_numeric<int8_t>(_value) {}
 
-  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
-      const char *, size_t, char16_t *) const noexcept final override {
-    return result(error_code::OTHER, 0);
+  // Splat constructor
+  simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {}
+  simdutf_really_inline operator simd8<uint8_t>() const;
+  // Member-by-member initialization
+  simdutf_really_inline
+  simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+        int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+        int8_t v12, int8_t v13, int8_t v14, int8_t v15, int8_t v16, int8_t v17,
+        int8_t v18, int8_t v19, int8_t v20, int8_t v21, int8_t v22, int8_t v23,
+        int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29,
+        int8_t v30, int8_t v31)
+      : simd8((__m256i)v32i8{v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7,
+                             v8,  v9,  v10, v11, v12, v13, v14, v15,
+                             v16, v17, v18, v19, v20, v21, v22, v23,
+                             v24, v25, v26, v27, v28, v29, v30, v31}) {}
+  // Repeat 16 values as many times as necessary (usually for lookup tables)
+  simdutf_really_inline static simd8<int8_t>
+  repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5,
+            int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11,
+            int8_t v12, int8_t v13, int8_t v14, int8_t v15) {
+    return simd8<int8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+                         v13, v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
+                         v10, v11, v12, v13, v14, v15);
   }
-
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
-      const char *, size_t, char16_t *) const noexcept final override {
-    return 0;
+  // True when no byte is negative as a signed value (no byte >= 0x80).
+  simdutf_really_inline bool is_ascii() const {
+    const __m256i negative_bytes = __lasx_xvslti_b(this->value, 0);
+    // xbnz_v reports whether any bit of the comparison mask is set.
+    return !__lasx_xbnz_v(negative_bytes);
   }
-
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
-      const char *, size_t, char16_t *) const noexcept final override {
-    return 0;
+  // Order-sensitive comparisons
+  simdutf_really_inline simd8<int8_t> max_val(const simd8<int8_t> other) const {
+    return __lasx_xvmax_b(this->value, other);
   }
-
-  simdutf_warn_unused size_t convert_utf8_to_utf32(
-      const char *, size_t, char32_t *) const noexcept final override {
-    return 0;
+  simdutf_really_inline simd8<int8_t> min_val(const simd8<int8_t> other) const {
+    return __lasx_xvmin_b(this->value, other);
   }
-
-  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
-      const char *, size_t, char32_t *) const noexcept final override {
-    return result(error_code::OTHER, 0);
+  simdutf_really_inline simd8<bool> operator>(const simd8<int8_t> other) const {
+    return __lasx_xvslt_b(other, this->value);
   }
-
-  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
-      const char *, size_t, char32_t *) const noexcept final override {
-    return 0;
+  simdutf_really_inline simd8<bool> operator<(const simd8<int8_t> other) const {
+    return __lasx_xvslt_b(this->value, other);
   }
+};
 
-  simdutf_warn_unused size_t convert_utf16le_to_latin1(
-      const char16_t *, size_t, char *) const noexcept final override {
-    return 0;
+// Unsigned bytes
+template <> struct simd8<uint8_t> : base8_numeric<uint8_t> {
+  simdutf_really_inline simd8() : base8_numeric<uint8_t>() {}
+  simdutf_really_inline simd8(const __m256i _value)
+      : base8_numeric<uint8_t>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {}
+  // Member-by-member initialization
+  simdutf_really_inline
+  simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5,
+        uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10,
+        uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15,
+        uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20,
+        uint8_t v21, uint8_t v22, uint8_t v23, uint8_t v24, uint8_t v25,
+        uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30,
+        uint8_t v31)
+      : simd8((__m256i)v32u8{v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7,
+                             v8,  v9,  v10, v11, v12, v13, v14, v15,
+                             v16, v17, v18, v19, v20, v21, v22, v23,
+                             v24, v25, v26, v27, v28, v29, v30, v31}) {}
+  // Repeat 16 values as many times as necessary (usually for lookup tables)
+  simdutf_really_inline static simd8<uint8_t>
+  repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
+            uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9,
+            uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14,
+            uint8_t v15) {
+    return simd8<uint8_t>(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12,
+                          v13, v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9,
+                          v10, v11, v12, v13, v14, v15);
   }
 
-  simdutf_warn_unused size_t convert_utf16be_to_latin1(
-      const char16_t *, size_t, char *) const noexcept final override {
-    return 0;
+  // Saturated math
+  simdutf_really_inline simd8<uint8_t>
+  saturating_add(const simd8<uint8_t> other) const {
+    return __lasx_xvsadd_bu(this->value, other);
   }
-
-  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
-      const char16_t *, size_t, char *) const noexcept final override {
-    return result(error_code::OTHER, 0);
+  simdutf_really_inline simd8<uint8_t>
+  saturating_sub(const simd8<uint8_t> other) const {
+    return __lasx_xvssub_bu(this->value, other);
   }
 
-  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
-      const char16_t *, size_t, char *) const noexcept final override {
-    return result(error_code::OTHER, 0);
+  // Order-specific operations
+  simdutf_really_inline simd8<uint8_t>
+  max_val(const simd8<uint8_t> other) const {
+    return __lasx_xvmax_bu(*this, other);
   }
-
-  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
-      const char16_t *, size_t, char *) const noexcept final override {
-    return 0;
+  simdutf_really_inline simd8<uint8_t>
+  min_val(const simd8<uint8_t> other) const {
+    return __lasx_xvmin_bu(*this, other);
   }
-
-  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
-      const char16_t *, size_t, char *) const noexcept final override {
-    return 0;
+  // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+  simdutf_really_inline simd8<uint8_t>
+  gt_bits(const simd8<uint8_t> other) const {
+    return this->saturating_sub(other);
   }
-
-  simdutf_warn_unused size_t convert_utf16le_to_utf8(
-      const char16_t *, size_t, char *) const noexcept final override {
-    return 0;
+  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+  simdutf_really_inline simd8<uint8_t>
+  lt_bits(const simd8<uint8_t> other) const {
+    return other.saturating_sub(*this);
   }
-
-  simdutf_warn_unused size_t convert_utf16be_to_utf8(
-      const char16_t *, size_t, char *) const noexcept final override {
-    return 0;
+  simdutf_really_inline simd8<bool>
+  operator<=(const simd8<uint8_t> other) const {
+    return __lasx_xvsle_bu(*this, other);
   }
-
-  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
-      const char16_t *, size_t, char *) const noexcept final override {
-    return result(error_code::OTHER, 0);
+  simdutf_really_inline simd8<bool>
+  operator>=(const simd8<uint8_t> other) const {
+    return __lasx_xvsle_bu(other, *this);
   }
-
-  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
-      const char16_t *, size_t, char *) const noexcept final override {
-    return result(error_code::OTHER, 0);
+  simdutf_really_inline simd8<bool>
+  operator>(const simd8<uint8_t> other) const {
+    return __lasx_xvslt_bu(other, *this); // this > other <=> other < this
   }
-
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
-      const char16_t *, size_t, char *) const noexcept final override {
-    return 0;
+  simdutf_really_inline simd8<bool>
+  operator<(const simd8<uint8_t> other) const {
+    return __lasx_xvslt_bu(*this, other); // unsigned per-byte this < other
   }
 
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
-      const char16_t *, size_t, char *) const noexcept final override {
-    return 0;
+  // Bit-specific operations
+  simdutf_really_inline simd8<bool> bits_not_set() const {
+    return *this == uint8_t(0);
   }
-
-  simdutf_warn_unused size_t convert_utf32_to_latin1(
-      const char32_t *, size_t, char *) const noexcept final override {
-    return 0;
+  simdutf_really_inline simd8<bool> bits_not_set(simd8<uint8_t> bits) const {
+    return (*this & bits).bits_not_set();
   }
-
-  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
-      const char32_t *, size_t, char *) const noexcept final override {
-    return result(error_code::OTHER, 0);
+  simdutf_really_inline simd8<bool> any_bits_set() const {
+    return ~this->bits_not_set();
   }
-
-  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
-      const char32_t *, size_t, char *) const noexcept final override {
-    return 0;
+  simdutf_really_inline simd8<bool> any_bits_set(simd8<uint8_t> bits) const {
+    return ~this->bits_not_set(bits);
   }
-
-  simdutf_warn_unused size_t convert_utf32_to_utf8(
-      const char32_t *, size_t, char *) const noexcept final override {
-    return 0;
+  simdutf_really_inline bool is_ascii() const {
+    // Compared as signed bytes, anything >= 0x80 tests below zero.
+    const __m256i high_bit_lanes = __lasx_xvslti_b(this->value, 0);
+    const bool has_non_ascii = __lasx_xbnz_v(high_bit_lanes) != 0;
+    return !has_non_ascii;
   }
-
-  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
-      const char32_t *, size_t, char *) const noexcept final override {
-    return result(error_code::OTHER, 0);
+  simdutf_really_inline bool any_bits_set_anywhere() const {
+    // True as soon as the register holds any non-zero bit.
+    const bool register_nonzero = __lasx_xbnz_v(this->value) != 0;
+    return register_nonzero;
   }
-
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
-      const char32_t *, size_t, char *) const noexcept final override {
-    return 0;
+  simdutf_really_inline bool any_bits_set_anywhere(simd8<uint8_t> bits) const {
+    return (*this & bits).any_bits_set_anywhere();
   }
-
-  simdutf_warn_unused size_t convert_utf32_to_utf16le(
-      const char32_t *, size_t, char16_t *) const noexcept final override {
-    return 0;
+  template <int N> simdutf_really_inline simd8<uint8_t> shr() const {
+    return __lasx_xvsrli_b(this->value, N);
   }
-
-  simdutf_warn_unused size_t convert_utf32_to_utf16be(
-      const char32_t *, size_t, char16_t *) const noexcept final override {
-    return 0;
+  template <int N> simdutf_really_inline simd8<uint8_t> shl() const {
+    return __lasx_xvslli_b(this->value, N);
   }
+};
+simdutf_really_inline simd8<int8_t>::operator simd8<uint8_t>() const {
+  return this->value;
+}
 
-  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
-      const char32_t *, size_t, char16_t *) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+template <typename T> struct simd8x64 {
+  static constexpr int NUM_CHUNKS = 64 / sizeof(simd8<T>);
+  static_assert(NUM_CHUNKS == 2,
+                "LASX kernel should use two registers per 64-byte block.");
+  simd8<T> chunks[NUM_CHUNKS];
 
-  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
-      const char32_t *, size_t, char16_t *) const noexcept final override {
-    return result(error_code::OTHER, 0);
-  }
+  simd8x64(const simd8x64<T> &o) = delete; // no copy allowed
+  simd8x64<T> &
+  operator=(const simd8<T> other) = delete; // no assignment allowed
+  simd8x64() = delete;                      // no default constructor allowed
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
-      const char32_t *, size_t, char16_t *) const noexcept final override {
-    return 0;
-  }
+  simdutf_really_inline simd8x64(const simd8<T> chunk0, const simd8<T> chunk1)
+      : chunks{chunk0, chunk1} {}
+  simdutf_really_inline simd8x64(const T *ptr)
+      : chunks{simd8<T>::load(ptr),
+               simd8<T>::load(ptr + sizeof(simd8<T>) / sizeof(T))} {}
 
-  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
-      const char32_t *, size_t, char16_t *) const noexcept final override {
-    return 0;
+  simdutf_really_inline void store(T *ptr) const {
+    this->chunks[0].store(ptr + sizeof(simd8<T>) * 0 / sizeof(T));
+    this->chunks[1].store(ptr + sizeof(simd8<T>) * 1 / sizeof(T));
   }
 
-  simdutf_warn_unused size_t convert_utf16le_to_utf32(
-      const char16_t *, size_t, char32_t *) const noexcept final override {
-    return 0;
+  simdutf_really_inline uint64_t to_bitmask() const {
+    // chunk 0 supplies bits 0..31 of the result, chunk 1 bits 32..63
+    const uint64_t low_half = uint32_t(this->chunks[0].to_bitmask());
+    return low_half | (uint64_t(this->chunks[1].to_bitmask()) << 32);
   }
 
-  simdutf_warn_unused size_t convert_utf16be_to_utf32(
-      const char16_t *, size_t, char32_t *) const noexcept final override {
-    return 0;
+  simdutf_really_inline simd8x64<T> &operator|=(const simd8x64<T> &other) {
+    this->chunks[0] |= other.chunks[0];
+    this->chunks[1] |= other.chunks[1];
+    return *this;
   }
 
-  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
-      const char16_t *, size_t, char32_t *) const noexcept final override {
-    return result(error_code::OTHER, 0);
+  simdutf_really_inline simd8<T> reduce_or() const {
+    return this->chunks[0] | this->chunks[1];
   }
 
-  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
-      const char16_t *, size_t, char32_t *) const noexcept final override {
-    return result(error_code::OTHER, 0);
+  simdutf_really_inline bool is_ascii() const {
+    return this->reduce_or().is_ascii();
   }
 
-  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
-      const char16_t *, size_t, char32_t *) const noexcept final override {
-    return 0;
+  template <endianness endian>
+  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
+    this->chunks[0].template store_ascii_as_utf16<endian>(ptr +
+                                                          sizeof(simd8<T>) * 0);
+    this->chunks[1].template store_ascii_as_utf16<endian>(ptr +
+                                                          sizeof(simd8<T>) * 1);
   }
 
-  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
-      const char16_t *, size_t, char32_t *) const noexcept final override {
-    return 0;
+  simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const {
+    this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 0);
+    this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8<T>) * 1);
   }
 
-  void change_endianness_utf16(const char16_t *, size_t,
-                               char16_t *) const noexcept final override {}
-
-  simdutf_warn_unused size_t
-  count_utf16le(const char16_t *, size_t) const noexcept final override {
-    return 0;
+  simdutf_really_inline simd8x64<T> bit_or(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<T>(this->chunks[0] | mask, this->chunks[1] | mask);
   }
 
-  simdutf_warn_unused size_t
-  count_utf16be(const char16_t *, size_t) const noexcept final override {
-    return 0;
+  simdutf_really_inline uint64_t eq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] == mask, this->chunks[1] == mask)
+        .to_bitmask();
   }
 
-  simdutf_warn_unused size_t count_utf8(const char *,
-                                        size_t) const noexcept final override {
-    return 0;
+  simdutf_really_inline uint64_t eq(const simd8x64<uint8_t> &other) const {
+    return simd8x64<bool>(this->chunks[0] == other.chunks[0],
+                          this->chunks[1] == other.chunks[1])
+        .to_bitmask();
   }
 
-  simdutf_warn_unused size_t
-  latin1_length_from_utf8(const char *, size_t) const noexcept override {
-    return 0;
+  simdutf_really_inline uint64_t lteq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask)
+        .to_bitmask();
   }
 
-  simdutf_warn_unused size_t
-  latin1_length_from_utf16(size_t) const noexcept override {
-    return 0;
+  simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+    const simd8<T> mask_low = simd8<T>::splat(low);
+    const simd8<T> mask_high = simd8<T>::splat(high);
+
+    return simd8x64<bool>(
+               (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+               (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low))
+        .to_bitmask();
+  }
+  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+    const simd8<T> mask_low = simd8<T>::splat(low);
+    const simd8<T> mask_high = simd8<T>::splat(high);
+    return simd8x64<bool>(
+               (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low),
+               (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low))
+        .to_bitmask();
+  }
+  simdutf_really_inline uint64_t lt(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] < mask, this->chunks[1] < mask)
+        .to_bitmask();
   }
 
-  simdutf_warn_unused size_t
-  latin1_length_from_utf32(size_t) const noexcept override {
-    return 0;
+  simdutf_really_inline uint64_t gt(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] > mask, this->chunks[1] > mask)
+        .to_bitmask();
   }
-  simdutf_warn_unused size_t
-  utf8_length_from_latin1(const char *, size_t) const noexcept override {
-    return 0;
+  simdutf_really_inline uint64_t gteq(const T m) const {
+    const simd8<T> mask = simd8<T>::splat(m);
+    return simd8x64<bool>(this->chunks[0] >= mask, this->chunks[1] >= mask)
+        .to_bitmask();
+  }
+  simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const {
+    const simd8<uint8_t> mask = simd8<uint8_t>::splat(m);
+    return simd8x64<bool>((simd8<uint8_t>(__m256i(this->chunks[0])) >= mask),
+                          (simd8<uint8_t>(__m256i(this->chunks[1])) >= mask))
+        .to_bitmask();
   }
+}; // struct simd8x64<T>
 
-  simdutf_warn_unused size_t
-  utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override {
-    return 0;
+/* begin file src/simdutf/lasx/simd16-inl.h */
+template <typename T> struct simd16;
+
+template <typename T, typename Mask = simd16<bool>>
+struct base16 : base<simd16<T>> {
+  using bitmask_type = uint32_t;
+
+  simdutf_really_inline base16() : base<simd16<T>>() {}
+  simdutf_really_inline base16(const __m256i _value)
+      : base<simd16<T>>(_value) {}
+  template <typename Pointer>
+  simdutf_really_inline base16(const Pointer *ptr)
+      : base16(__lasx_xvld(reinterpret_cast<const __m256i *>(ptr), 0)) {}
+  friend simdutf_really_inline Mask operator==(const simd16<T> lhs,
+                                               const simd16<T> rhs) {
+    return __lasx_xvseq_h(lhs.value, rhs.value);
   }
 
-  simdutf_warn_unused size_t
-  utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override {
-    return 0;
+  /// the size of vector in bytes
+  static const int SIZE = sizeof(base<simd16<T>>::value);
+
+  /// the number of elements of type T a vector can hold
+  static const int ELEMENTS = SIZE / sizeof(T);
+
+  template <int N = 1>
+  simdutf_really_inline simd16<T> prev(const simd16<T> prev_chunk) const {
+    if (!N)
+      return this->value;
+    // N == 0 is the identity; otherwise lanes are blended with prev_chunk.
+    __m256i zero = __lasx_xvldi(0);
+    __m256i result, shuf;
+    if (N < 8) {
+      shuf = __lasx_xvld(prev_shuf_table[N * 2], 0);
+      // N * 2 is the element count expressed in bytes (16-bit elements).
+      result = __lasx_xvshuf_b(
+          __lasx_xvpermi_q(this->value, this->value, 0b00000001), this->value,
+          shuf);
+      __m256i srl_prev = __lasx_xvbsrl_v(
+          __lasx_xvpermi_q(zero, prev_chunk, 0b00110001), (16 - N * 2));
+      __m256i mask = __lasx_xvld(bitsel_mask_table[N], 0); // NOTE(review): indexed by N here but by N * 2 in the else branch - confirm
+      result = __lasx_xvbitsel_v(result, srl_prev, mask);
+
+      return result;
+    } else if (N == 8) { // only a 128-bit permute of this->value and prev_chunk
+      return __lasx_xvpermi_q(this->value, prev_chunk, 0b00100001);
+    } else {
+      __m256i sll_value = __lasx_xvbsll_v(
+          __lasx_xvpermi_q(zero, this->value, 0b00000011), (N * 2 - 16));
+      __m256i mask = __lasx_xvld(bitsel_mask_table[N * 2], 0);
+      shuf = __lasx_xvld(prev_shuf_table[N * 2], 0);
+      result =
+          __lasx_xvshuf_b(__lasx_xvpermi_q(prev_chunk, prev_chunk, 0b00000001),
+                          prev_chunk, shuf);
+      result = __lasx_xvbitsel_v(sll_value, result, mask); // mask picks between shifted value and shuffled prev_chunk
+      return result;
+    }
   }
+};
 
-  simdutf_warn_unused size_t
-  utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override {
-    return 0;
+// SIMD 16-bit lane mask type (returned by things like eq and gt)
+template <> struct simd16<bool> : base16<bool> {
+  static simdutf_really_inline simd16<bool> splat(bool _value) {
+    return __lasx_xvreplgr2vr_h(uint16_t(-(!!_value))); // 0xFFFF or 0x0000 in every 16-bit lane
   }
 
-  simdutf_warn_unused size_t
-  utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override {
-    return 0;
+  simdutf_really_inline simd16() : base16() {}
+  simdutf_really_inline simd16(const __m256i _value) : base16<bool>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd16(bool _value) : base16<bool>(splat(_value)) {}
+
+  simdutf_really_inline bitmask_type to_bitmask() const {
+    __m256i mask = __lasx_xvmsknz_b(this->value);
+    bitmask_type mask0 = __lasx_xvpickve2gr_wu(mask, 0);
+    bitmask_type mask1 = __lasx_xvpickve2gr_wu(mask, 4);
+    return (mask0 | (mask1 << 16));
+  }
+  simdutf_really_inline bool any() const {
+    // The mask has a true lane unless the whole register tests as zero.
+    const bool all_clear = __lasx_xbz_v(this->value) != 0;
+    return !all_clear;
+  }
+  simdutf_really_inline simd16<bool> operator~() const { return *this ^ true; }
+};
+
+template <typename T> struct base16_numeric : base16<T> {
+  static simdutf_really_inline simd16<T> splat(T _value) {
+    return __lasx_xvreplgr2vr_h((uint16_t)_value);
+  }
+  static simdutf_really_inline simd16<T> zero() { return __lasx_xvldi(0); }
+  static simdutf_really_inline simd16<T> load(const T values[16]) {
+    return __lasx_xvld(reinterpret_cast<const __m256i *>(values), 0); // reads one full __m256i (16 elements)
   }
 
-  simdutf_warn_unused size_t
-  utf32_length_from_latin1(size_t) const noexcept override {
-    return 0;
+  simdutf_really_inline base16_numeric() : base16<T>() {}
+  simdutf_really_inline base16_numeric(const __m256i _value)
+      : base16<T>(_value) {}
+
+  // Store to array
+  simdutf_really_inline void store(T dst[16]) const {
+    return __lasx_xvst(this->value, reinterpret_cast<__m256i *>(dst), 0); // writes one full __m256i (16 elements)
   }
 
-  simdutf_warn_unused size_t
-  utf16_length_from_utf8(const char *, size_t) const noexcept override {
-    return 0;
+  // Override to distinguish from bool version
+  simdutf_really_inline simd16<T> operator~() const { return *this ^ 0xFFFFu; }
+
+  // Addition/subtraction are the same for signed and unsigned
+  simdutf_really_inline simd16<T> operator+(const simd16<T> other) const {
+    return __lasx_xvadd_h(*this, other);
   }
-  simdutf_warn_unused size_t
-  utf16_length_from_latin1(size_t) const noexcept override {
-    return 0;
+  simdutf_really_inline simd16<T> operator-(const simd16<T> other) const {
+    return __lasx_xvsub_h(*this, other);
   }
-  simdutf_warn_unused size_t
-  utf8_length_from_utf32(const char32_t *, size_t) const noexcept override {
-    return 0;
+  simdutf_really_inline simd16<T> &operator+=(const simd16<T> other) {
+    *this = *this + other;
+    return *static_cast<simd16<T> *>(this);
+  }
+  simdutf_really_inline simd16<T> &operator-=(const simd16<T> other) {
+    *this = *this - other;
+    return *static_cast<simd16<T> *>(this);
   }
+};
 
-  simdutf_warn_unused size_t
-  utf16_length_from_utf32(const char32_t *, size_t) const noexcept override {
-    return 0;
+// Signed code units
+template <> struct simd16<int16_t> : base16_numeric<int16_t> {
+  simdutf_really_inline simd16() : base16_numeric<int16_t>() {}
+  simdutf_really_inline simd16(const __m256i _value)
+      : base16_numeric<int16_t>(_value) {}
+  // Splat constructor
+  simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const int16_t *values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t *values)
+      : simd16(load(reinterpret_cast<const int16_t *>(values))) {}
+  // Order-sensitive comparisons
+  simdutf_really_inline simd16<int16_t>
+  max_val(const simd16<int16_t> other) const {
+    return __lasx_xvmax_h(*this, other);
+  }
+  simdutf_really_inline simd16<int16_t>
+  min_val(const simd16<int16_t> other) const {
+    return __lasx_xvmin_h(*this, other);
+  }
+  simdutf_really_inline simd16<bool>
+  operator>(const simd16<int16_t> other) const {
+    return __lasx_xvslt_h(other.value, this->value); // strict: other < this, equal lanes are false
+  }
+  simdutf_really_inline simd16<bool>
+  operator<(const simd16<int16_t> other) const {
+    return __lasx_xvslt_h(this->value, other.value);
   }
+};
 
-  simdutf_warn_unused size_t
-  utf32_length_from_utf8(const char *, size_t) const noexcept override {
-    return 0;
+// Unsigned code units
+template <> struct simd16<uint16_t> : base16_numeric<uint16_t> {
+  simdutf_really_inline simd16() : base16_numeric<uint16_t>() {}
+  simdutf_really_inline simd16(const __m256i _value)
+      : base16_numeric<uint16_t>(_value) {}
+
+  // Splat constructor
+  simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {}
+  // Array constructor
+  simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {}
+  simdutf_really_inline simd16(const char16_t *values)
+      : simd16(load(reinterpret_cast<const uint16_t *>(values))) {}
+
+  // Saturated math
+  simdutf_really_inline simd16<uint16_t>
+  saturating_add(const simd16<uint16_t> other) const {
+    return __lasx_xvsadd_hu(this->value, other.value);
+  }
+  simdutf_really_inline simd16<uint16_t>
+  saturating_sub(const simd16<uint16_t> other) const {
+    return __lasx_xvssub_hu(this->value, other.value);
   }
 
-  simdutf_warn_unused size_t maximal_binary_length_from_base64(
-      const char *, size_t) const noexcept override {
-    return 0;
+  // Order-specific operations
+  simdutf_really_inline simd16<uint16_t>
+  max_val(const simd16<uint16_t> other) const {
+    return __lasx_xvmax_hu(this->value, other.value);
+  }
+  simdutf_really_inline simd16<uint16_t>
+  min_val(const simd16<uint16_t> other) const {
+    return __lasx_xvmin_hu(this->value, other.value);
+  }
+  // Same as >, but only guarantees true is nonzero (> guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t>
+  gt_bits(const simd16<uint16_t> other) const {
+    return this->saturating_sub(other);
+  }
+  // Same as <, but only guarantees true is nonzero (< guarantees true = -1)
+  simdutf_really_inline simd16<uint16_t>
+  lt_bits(const simd16<uint16_t> other) const {
+    return other.saturating_sub(*this);
+  }
+  simdutf_really_inline simd16<bool>
+  operator<=(const simd16<uint16_t> other) const {
+    return __lasx_xvsle_hu(this->value, other.value);
+  }
+  simdutf_really_inline simd16<bool>
+  operator>=(const simd16<uint16_t> other) const {
+    return __lasx_xvsle_hu(other.value, this->value);
+  }
+  simdutf_really_inline simd16<bool>
+  operator>(const simd16<uint16_t> other) const {
+    return __lasx_xvslt_hu(other.value, this->value);
+  }
+  simdutf_really_inline simd16<bool>
+  operator<(const simd16<uint16_t> other) const {
+    return __lasx_xvslt_hu(this->value, other.value);
   }
 
-  simdutf_warn_unused result
-  base64_to_binary(const char *, size_t, char *, base64_options,
-                   last_chunk_handling_options) const noexcept override {
-    return result(error_code::OTHER, 0);
+  // Bit-specific operations
+  simdutf_really_inline simd16<bool> bits_not_set() const {
+    return *this == uint16_t(0);
+  }
+  simdutf_really_inline simd16<bool> bits_not_set(simd16<uint16_t> bits) const {
+    return (*this & bits).bits_not_set();
+  }
+  simdutf_really_inline simd16<bool> any_bits_set() const {
+    return ~this->bits_not_set();
+  }
+  simdutf_really_inline simd16<bool> any_bits_set(simd16<uint16_t> bits) const {
+    return ~this->bits_not_set(bits);
   }
 
-  simdutf_warn_unused full_result base64_to_binary_details(
-      const char *, size_t, char *, base64_options,
-      last_chunk_handling_options) const noexcept override {
-    return full_result(error_code::OTHER, 0, 0);
+  simdutf_really_inline bool any_bits_set_anywhere() const {
+    // True as soon as the register holds any non-zero bit.
+    const bool register_nonzero = __lasx_xbnz_v(this->value) != 0;
+    return register_nonzero;
+  }
+  simdutf_really_inline bool
+  any_bits_set_anywhere(simd16<uint16_t> bits) const {
+    return (*this & bits).any_bits_set_anywhere();
   }
 
-  simdutf_warn_unused size_t maximal_binary_length_from_base64(
-      const char16_t *, size_t) const noexcept override {
-    return 0;
+  template <int N> simdutf_really_inline simd16<uint16_t> shr() const {
+    return simd16<uint16_t>(__lasx_xvsrli_h(this->value, N));
+  }
+  template <int N> simdutf_really_inline simd16<uint16_t> shl() const {
+    return simd16<uint16_t>(__lasx_xvslli_h(this->value, N));
   }
 
-  simdutf_warn_unused result
-  base64_to_binary(const char16_t *, size_t, char *, base64_options,
-                   last_chunk_handling_options) const noexcept override {
-    return result(error_code::OTHER, 0);
+  // Change the endianness
+  simdutf_really_inline simd16<uint16_t> swap_bytes() const {
+    return __lasx_xvshuf4i_b(this->value, 0b10110001);
   }
 
-  simdutf_warn_unused full_result base64_to_binary_details(
-      const char16_t *, size_t, char *, base64_options,
-      last_chunk_handling_options) const noexcept override {
-    return full_result(error_code::OTHER, 0, 0);
+  // Pack with the unsigned saturation of two uint16_t code units into single
+  // uint8_t vector
+  static simdutf_really_inline simd8<uint8_t> pack(const simd16<uint16_t> &v0,
+                                                   const simd16<uint16_t> &v1) {
+    return __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(v1.value, v0.value, 0),
+                            0b11011000);
   }
+};
 
-  simdutf_warn_unused size_t
-  base64_length_from_binary(size_t, base64_options) const noexcept override {
-    return 0;
+template <typename T> struct simd16x32 {
+  static constexpr int NUM_CHUNKS = 64 / sizeof(simd16<T>);
+  static_assert(NUM_CHUNKS == 2,
+                "LASX kernel should use two registers per 64-byte block.");
+  simd16<T> chunks[NUM_CHUNKS];
+
+  simd16x32(const simd16x32<T> &o) = delete; // no copy allowed
+  simd16x32<T> &
+  operator=(const simd16<T> other) = delete; // no assignment allowed
+  simd16x32() = delete;                      // no default constructor allowed
+
+  simdutf_really_inline simd16x32(const simd16<T> chunk0,
+                                  const simd16<T> chunk1)
+      : chunks{chunk0, chunk1} {}
+  simdutf_really_inline simd16x32(const T *ptr)
+      : chunks{simd16<T>::load(ptr),
+               simd16<T>::load(ptr + sizeof(simd16<T>) / sizeof(T))} {}
+
+  simdutf_really_inline void store(T *ptr) const {
+    this->chunks[0].store(ptr + sizeof(simd16<T>) * 0 / sizeof(T));
+    this->chunks[1].store(ptr + sizeof(simd16<T>) * 1 / sizeof(T));
   }
 
-  size_t binary_to_base64(const char *, size_t, char *,
-                          base64_options) const noexcept override {
-    return 0;
+  simdutf_really_inline uint64_t to_bitmask() const {
+    // chunk 0 supplies bits 0..31 of the result, chunk 1 bits 32..63
+    const uint64_t low_half = uint32_t(this->chunks[0].to_bitmask());
+    return low_half | (uint64_t(this->chunks[1].to_bitmask()) << 32);
   }
 
-  unsupported_implementation()
-      : implementation("unsupported",
-                       "Unsupported CPU (no detected SIMD instructions)", 0) {}
-};
+  simdutf_really_inline simd16<T> reduce_or() const {
+    return this->chunks[0] | this->chunks[1];
+  }
 
-const unsupported_implementation *get_unsupported_singleton() {
-  static const unsupported_implementation unsupported_singleton{};
-  return &unsupported_singleton;
-}
-static_assert(std::is_trivially_destructible<unsupported_implementation>::value,
-              "unsupported_singleton should be trivially destructible");
+  simdutf_really_inline bool is_ascii() const {
+    return this->reduce_or().is_ascii();
+  }
 
-size_t available_implementation_list::size() const noexcept {
-  return internal::get_available_implementation_pointers().size();
-}
-const implementation *const *
-available_implementation_list::begin() const noexcept {
-  return internal::get_available_implementation_pointers().begin();
-}
-const implementation *const *
-available_implementation_list::end() const noexcept {
-  return internal::get_available_implementation_pointers().end();
-}
-const implementation *
-available_implementation_list::detect_best_supported() const noexcept {
-  // They are prelisted in priority order, so we just go down the list
-  uint32_t supported_instruction_sets =
-      internal::detect_supported_architectures();
-  for (const implementation *impl :
-       internal::get_available_implementation_pointers()) {
-    uint32_t required_instruction_sets = impl->required_instruction_sets();
-    if ((supported_instruction_sets & required_instruction_sets) ==
-        required_instruction_sets) {
-      return impl;
-    }
+  simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const {
+    this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16<T>) * 0);
+    this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16<T>));
   }
-  return get_unsupported_singleton(); // this should never happen?
-}
 
-const implementation *
-detect_best_supported_implementation_on_first_use::set_best() const noexcept {
-  SIMDUTF_PUSH_DISABLE_WARNINGS
-  SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC:
-                                     // manually verified this is safe
-      char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION");
-  SIMDUTF_POP_DISABLE_WARNINGS
+  simdutf_really_inline simd16x32<T> bit_or(const T m) const {
+    const simd16<T> mask = simd16<T>::splat(m); // same mask for both chunks
+    return simd16x32<T>(this->chunks[0] | mask, this->chunks[1] | mask);
+  }
 
-  if (force_implementation_name) {
-    auto force_implementation =
-        get_available_implementations()[force_implementation_name];
-    if (force_implementation) {
-      return get_active_implementation() = force_implementation;
-    } else {
-      // Note: abort() and stderr usage within the library is forbidden.
-      return get_active_implementation() = get_unsupported_singleton();
-    }
+  simdutf_really_inline void swap_bytes() { // overwrites both chunks in place
+    this->chunks[0] = this->chunks[0].swap_bytes();
+    this->chunks[1] = this->chunks[1].swap_bytes();
   }
-  return get_active_implementation() =
-             get_available_implementations().detect_best_supported();
-}
 
-} // namespace internal
+  simdutf_really_inline uint64_t eq(const T m) const {
+    const simd16<T> mask = simd16<T>::splat(m);
+    return simd16x32<bool>(this->chunks[0] == mask, this->chunks[1] == mask)
+        .to_bitmask(); // both chunks compared == splat(m), packed by to_bitmask()
+  }
 
-/**
- * The list of available implementations compiled into simdutf.
- */
-SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list &
-get_available_implementations() {
-  static const internal::available_implementation_list
-      available_implementations{};
-  return available_implementations;
-}
+  simdutf_really_inline uint64_t eq(const simd16x32<uint16_t> &other) const {
+    return simd16x32<bool>(this->chunks[0] == other.chunks[0],
+                           this->chunks[1] == other.chunks[1])
+        .to_bitmask(); // NOTE(review): other is fixed to uint16_t rather than T
+  }
+
+  simdutf_really_inline uint64_t lteq(const T m) const {
+    const simd16<T> mask = simd16<T>::splat(m);
+    return simd16x32<bool>(this->chunks[0] <= mask, this->chunks[1] <= mask)
+        .to_bitmask(); // both chunks compared <= splat(m), packed by to_bitmask()
+  }
+
+  simdutf_really_inline uint64_t in_range(const T low, const T high) const {
+    const simd16<T> mask_low = simd16<T>::splat(low);
+    const simd16<T> mask_high = simd16<T>::splat(high);
+
+    return simd16x32<bool>(
+               (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low),
+               (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low))
+        .to_bitmask(); // both bounds are inclusive: low <= value <= high
+  }
+  simdutf_really_inline uint64_t not_in_range(const T low, const T high) const {
+    const simd16<T> mask_low = simd16<T>::splat(static_cast<T>(low - 1));
+    const simd16<T> mask_high = simd16<T>::splat(static_cast<T>(high + 1));
+    return simd16x32<bool>(
+               (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low),
+               (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low))
+        .to_bitmask(); // NOTE(review): low - 1 / high + 1 wrap around at T's limits
+  }
+  simdutf_really_inline uint64_t lt(const T m) const {
+    const simd16<T> mask = simd16<T>::splat(m);
+    return simd16x32<bool>(this->chunks[0] < mask, this->chunks[1] < mask)
+        .to_bitmask(); // both chunks compared < splat(m), packed by to_bitmask()
+  }
+}; // struct simd16x32<T>
+/* end file src/simdutf/lasx/simd16-inl.h */
+} // namespace simd
+} // unnamed namespace
+} // namespace lasx
+} // namespace simdutf
+
+#endif // SIMDUTF_LASX_SIMD_H
+/* end file src/simdutf/lasx/simd.h */
 
+/* begin file src/simdutf/lasx/end.h */
+/* end file src/simdutf/lasx/end.h */
+
+#endif // SIMDUTF_IMPLEMENTATION_LASX
+
+#endif // SIMDUTF_LASX_H
+/* end file src/simdutf/lasx.h */
+/* begin file src/simdutf/fallback.h */
+#ifndef SIMDUTF_FALLBACK_H
+#define SIMDUTF_FALLBACK_H
+
+
+// Note that fallback.h is always imported last.
+
+// Enable the fallback by default unless another implementation is already
+// known to always run (any SIMDUTF_CAN_ALWAYS_RUN_* macro below is set).
+#ifndef SIMDUTF_IMPLEMENTATION_FALLBACK
+  #if SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE ||        \
+      SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE ||     \
+      SIMDUTF_CAN_ALWAYS_RUN_PPC64 || SIMDUTF_CAN_ALWAYS_RUN_RVV ||            \
+      SIMDUTF_CAN_ALWAYS_RUN_LSX || SIMDUTF_CAN_ALWAYS_RUN_LASX
+    #define SIMDUTF_IMPLEMENTATION_FALLBACK 0
+  #else
+    #define SIMDUTF_IMPLEMENTATION_FALLBACK 1
+  #endif
+#endif
+
+#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK)
+
+#if SIMDUTF_IMPLEMENTATION_FALLBACK
+
+namespace simdutf {
 /**
- * The active implementation.
+ * Fallback implementation (runs on any machine).
  */
-SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> &
-get_active_implementation() {
-#if SIMDUTF_SINGLE_IMPLEMENTATION
-  // skip runtime detection
-  static internal::atomic_ptr<const implementation> active_implementation{
-      internal::get_single_implementation()};
-  return active_implementation;
-#else
-  static const internal::detect_best_supported_implementation_on_first_use
-      detect_best_supported_implementation_on_first_use_singleton;
-  static internal::atomic_ptr<const implementation> active_implementation{
-      &detect_best_supported_implementation_on_first_use_singleton};
-  return active_implementation;
-#endif
-}
+namespace fallback {} // namespace fallback
+} // namespace simdutf
 
-#if SIMDUTF_SINGLE_IMPLEMENTATION
-const implementation *get_default_implementation() {
-  return internal::get_single_implementation();
-}
-#else
-internal::atomic_ptr<const implementation> &get_default_implementation() {
-  return get_active_implementation();
-}
-#endif
-#define SIMDUTF_GET_CURRENT_IMPLEMENTION
+/* begin file src/simdutf/fallback/implementation.h */
+#ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H
+#define SIMDUTF_FALLBACK_IMPLEMENTATION_H
 
-simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept {
-  return get_default_implementation()->validate_utf8(buf, len);
-}
-simdutf_warn_unused result validate_utf8_with_errors(const char *buf,
-                                                     size_t len) noexcept {
-  return get_default_implementation()->validate_utf8_with_errors(buf, len);
-}
-simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept {
-  return get_default_implementation()->validate_ascii(buf, len);
-}
-simdutf_warn_unused result validate_ascii_with_errors(const char *buf,
-                                                      size_t len) noexcept {
-  return get_default_implementation()->validate_ascii_with_errors(buf, len);
-}
-simdutf_warn_unused size_t convert_utf8_to_utf16(
-    const char *input, size_t length, char16_t *utf16_output) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf8_to_utf16be(input, length, utf16_output);
-#else
-  return convert_utf8_to_utf16le(input, length, utf16_output);
-#endif
-}
-simdutf_warn_unused size_t convert_latin1_to_utf8(const char *buf, size_t len,
-                                                  char *utf8_output) noexcept {
-  return get_default_implementation()->convert_latin1_to_utf8(buf, len,
-                                                              utf8_output);
-}
-simdutf_warn_unused size_t convert_latin1_to_utf16le(
-    const char *buf, size_t len, char16_t *utf16_output) noexcept {
-  return get_default_implementation()->convert_latin1_to_utf16le(buf, len,
-                                                                 utf16_output);
+
+namespace simdutf {
+namespace fallback {
+
+namespace {
+using namespace simdutf;
 }
-simdutf_warn_unused size_t convert_latin1_to_utf16be(
-    const char *buf, size_t len, char16_t *utf16_output) noexcept {
-  return get_default_implementation()->convert_latin1_to_utf16be(buf, len,
-                                                                 utf16_output);
-}
-simdutf_warn_unused size_t convert_latin1_to_utf32(
-    const char *buf, size_t len, char32_t *latin1_output) noexcept {
-  return get_default_implementation()->convert_latin1_to_utf32(buf, len,
-                                                               latin1_output);
+
+class implementation final : public simdutf::implementation {
+public:
+  simdutf_really_inline implementation()
+      : simdutf::implementation("fallback", "Generic fallback implementation",
+                                0) {} // NOTE(review): 0 is presumably "no required instruction sets" - confirm against the base constructor
+  simdutf_warn_unused int detect_encodings(const char *input,
+                                           size_t length) const noexcept final;
+  simdutf_warn_unused bool validate_utf8(const char *buf,
+                                         size_t len) const noexcept final;
+  simdutf_warn_unused result
+  validate_utf8_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_ascii(const char *buf,
+                                          size_t len) const noexcept final;
+  simdutf_warn_unused result
+  validate_ascii_with_errors(const char *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
+                                            size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
+                                            size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16le_with_errors(
+      const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf16be_with_errors(
+      const char16_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused bool validate_utf32(const char32_t *buf,
+                                          size_t len) const noexcept final;
+  simdutf_warn_unused result validate_utf32_with_errors(
+      const char32_t *buf, size_t len) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf8(
+      const char *buf, size_t len, char *utf8_output) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(
+      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(
+      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_latin1_to_utf32(
+      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_latin1(
+      const char *buf, size_t len, char *latin1_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+      const char *buf, size_t len, char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+      const char *buf, size_t len, char *latin1_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16le(
+      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf16be(
+      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+      const char *buf, size_t len, char16_t *utf16_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+      const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf8_to_utf32(
+      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+      const char *buf, size_t len, char32_t *utf32_output) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+      const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf16le_to_latin1(const char16_t *buf, size_t len,
+                            char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf16be_to_latin1(const char16_t *buf, size_t len,
+                            char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+      const char16_t *buf, size_t len,
+      char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+      const char16_t *buf, size_t len,
+      char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len,
+                                  char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len,
+                                  char *latin1_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16le_to_utf8(
+      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf16be_to_utf8(
+      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+      const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_utf32_to_utf8(
+      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+      const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf32_to_latin1(const char32_t *buf, size_t len,
+                          char *latin1_output) const noexcept final;
+  simdutf_warn_unused result
+  convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+                                      char *latin1_output) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf32_to_latin1(const char32_t *buf, size_t len,
+                                char *latin1_output) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf32_to_utf16le(const char32_t *buf, size_t len,
+                           char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf32_to_utf16be(const char32_t *buf, size_t len,
+                           char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+      const char32_t *buf, size_t len,
+      char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+      const char32_t *buf, size_t len,
+      char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len,
+                                 char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len,
+                                 char16_t *utf16_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf16le_to_utf32(const char16_t *buf, size_t len,
+                           char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_utf16be_to_utf32(const char16_t *buf, size_t len,
+                           char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+      const char16_t *buf, size_t len,
+      char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+      const char16_t *buf, size_t len,
+      char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len,
+                                 char32_t *utf32_buffer) const noexcept final;
+  simdutf_warn_unused size_t
+  convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len,
+                                 char32_t *utf32_buffer) const noexcept final;
+  void change_endianness_utf16(const char16_t *buf, size_t length,
+                               char16_t *output) const noexcept final;
+  simdutf_warn_unused size_t count_utf16le(const char16_t *buf,
+                                           size_t length) const noexcept; // NOTE(review): from here on the declarations are not marked final, unlike those above
+  simdutf_warn_unused size_t count_utf16be(const char16_t *buf,
+                                           size_t length) const noexcept;
+  simdutf_warn_unused size_t count_utf8(const char *buf,
+                                        size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16le(
+      const char16_t *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t utf32_length_from_utf16be(
+      const char16_t *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf16_length_from_utf8(const char *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf32_length_from_utf8(const char *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  latin1_length_from_utf8(const char *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  latin1_length_from_utf16(size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  latin1_length_from_utf32(size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf32_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf16_length_from_latin1(size_t length) const noexcept;
+  simdutf_warn_unused size_t
+  utf8_length_from_latin1(const char *input, size_t length) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(
+      const char *input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(
+      const char *input, size_t length, char *output, base64_options options,
+      last_chunk_handling_options last_chunk_options) const noexcept;
+  simdutf_warn_unused full_result base64_to_binary_details(
+      const char *input, size_t length, char *output, base64_options options,
+      last_chunk_handling_options last_chunk_options =
+          last_chunk_handling_options::loose) const noexcept;
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(
+      const char16_t *input, size_t length) const noexcept;
+  simdutf_warn_unused result base64_to_binary(
+      const char16_t *input, size_t length, char *output,
+      base64_options options,
+      last_chunk_handling_options last_chunk_options) const noexcept;
+  simdutf_warn_unused size_t base64_length_from_binary(
+      size_t length, base64_options options) const noexcept;
+  simdutf_warn_unused full_result base64_to_binary_details(
+      const char16_t *input, size_t length, char *output,
+      base64_options options,
+      last_chunk_handling_options last_chunk_options =
+          last_chunk_handling_options::loose) const noexcept;
+  size_t binary_to_base64(const char *input, size_t length, char *output,
+                          base64_options options) const noexcept;
+};
+} // namespace fallback
+} // namespace simdutf
+
+#endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H
+/* end file src/simdutf/fallback/implementation.h */
+
+/* begin file src/simdutf/fallback/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "fallback"
+// #define SIMDUTF_IMPLEMENTATION fallback
+/* end file src/simdutf/fallback/begin.h */
+
+  // Declarations
+/* begin file src/simdutf/fallback/bitmanipulation.h */
+#ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H
+#define SIMDUTF_FALLBACK_BITMANIPULATION_H
+
+#include <limits>
+
+namespace simdutf {
+namespace fallback {
+namespace {} // unnamed namespace
+} // namespace fallback
+} // namespace simdutf
+
+#endif // SIMDUTF_FALLBACK_BITMANIPULATION_H
+/* end file src/simdutf/fallback/bitmanipulation.h */
+
+/* begin file src/simdutf/fallback/end.h */
+/* end file src/simdutf/fallback/end.h */
+
+#endif // SIMDUTF_IMPLEMENTATION_FALLBACK
+#endif // SIMDUTF_FALLBACK_H
+/* end file src/simdutf/fallback.h */
+
+/* begin file src/scalar/utf8.h */
+#ifndef SIMDUTF_UTF8_H
+#define SIMDUTF_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8 {
+#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_RVV
+// only compiled when the fallback or RVV kernel is enabled.
+// credit: based on code from Google Fuchsia (Apache Licensed)
+inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  uint64_t pos = 0;
+  uint32_t code_point = 0;
+  while (pos < len) {
+    // check if the next 16 bytes are ascii.
+    uint64_t next_pos = pos + 16;
+    if (next_pos <=
+        len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      std::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2};
+      if ((v & 0x8080808080808080) == 0) {
+        pos = next_pos;
+        continue;
+      }
+    }
+    unsigned char byte = data[pos];
+
+    while (byte < 0b10000000) { // consume ASCII bytes one at a time
+      if (++pos == len) {
+        return true;
+      }
+      byte = data[pos];
+    }
+
+    if ((byte & 0b11100000) == 0b11000000) { // 2-byte sequence
+      next_pos = pos + 2;
+      if (next_pos > len) {
+        return false;
+      }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return false;
+      }
+      // range check
+      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if ((code_point < 0x80) || (0x7ff < code_point)) {
+        return false;
+      }
+    } else if ((byte & 0b11110000) == 0b11100000) { // 3-byte sequence
+      next_pos = pos + 3;
+      if (next_pos > len) {
+        return false;
+      }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return false;
+      }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+        return false;
+      }
+      // range check
+      code_point = (byte & 0b00001111) << 12 |
+                   (data[pos + 1] & 0b00111111) << 6 |
+                   (data[pos + 2] & 0b00111111);
+      if ((code_point < 0x800) || (0xffff < code_point) ||
+          (0xd7ff < code_point && code_point < 0xe000)) {
+        return false;
+      }
+    } else if ((byte & 0b11111000) == 0b11110000) { // 4-byte sequence
+      next_pos = pos + 4;
+      if (next_pos > len) {
+        return false;
+      }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return false;
+      }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+        return false;
+      }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+        return false;
+      }
+      // range check
+      code_point =
+          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff || 0x10ffff < code_point) {
+        return false;
+      }
+    } else {
+      // stray continuation byte or invalid leading byte
+      return false;
+    }
+    pos = next_pos;
+  }
+  return true;
 }
-simdutf_warn_unused size_t convert_utf8_to_latin1(
-    const char *buf, size_t len, char *latin1_output) noexcept {
-  return get_default_implementation()->convert_utf8_to_latin1(buf, len,
-                                                              latin1_output);
+#endif
+
+inline simdutf_warn_unused result validate_with_errors(const char *buf,
+                                                       size_t len) noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  size_t pos = 0;
+  uint32_t code_point = 0;
+  while (pos < len) {
+    // check if the next 16 bytes are ascii.
+    size_t next_pos = pos + 16;
+    if (next_pos <=
+        len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      std::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2};
+      if ((v & 0x8080808080808080) == 0) {
+        pos = next_pos;
+        continue;
+      }
+    }
+    unsigned char byte = data[pos];
+
+    while (byte < 0b10000000) { // consume ASCII bytes one at a time
+      if (++pos == len) {
+        return result(error_code::SUCCESS, len);
+      }
+      byte = data[pos];
+    }
+
+    if ((byte & 0b11100000) == 0b11000000) { // 2-byte sequence
+      next_pos = pos + 2;
+      if (next_pos > len) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      // range check
+      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if ((code_point < 0x80) || (0x7ff < code_point)) {
+        return result(error_code::OVERLONG, pos);
+      }
+    } else if ((byte & 0b11110000) == 0b11100000) { // 3-byte sequence
+      next_pos = pos + 3;
+      if (next_pos > len) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      // range check
+      code_point = (byte & 0b00001111) << 12 |
+                   (data[pos + 1] & 0b00111111) << 6 |
+                   (data[pos + 2] & 0b00111111);
+      if ((code_point < 0x800) || (0xffff < code_point)) {
+        return result(error_code::OVERLONG, pos);
+      }
+      if (0xd7ff < code_point && code_point < 0xe000) {
+        return result(error_code::SURROGATE, pos);
+      }
+    } else if ((byte & 0b11111000) == 0b11110000) { // 4-byte sequence
+      next_pos = pos + 4;
+      if (next_pos > len) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos);
+      }
+      // range check
+      code_point =
+          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
+          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff) {
+        return result(error_code::OVERLONG, pos);
+      }
+      if (0x10ffff < code_point) {
+        return result(error_code::TOO_LARGE, pos);
+      }
+    } else {
+      // we either have too many continuation bytes or an invalid leading byte
+      if ((byte & 0b11000000) == 0b10000000) {
+        return result(error_code::TOO_LONG, pos);
+      } else {
+        return result(error_code::HEADER_BITS, pos);
+      }
+    }
+    pos = next_pos;
+  }
+  return result(error_code::SUCCESS, len);
 }
-simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
-    const char *buf, size_t len, char *latin1_output) noexcept {
-  return get_default_implementation()->convert_utf8_to_latin1_with_errors(
-      buf, len, latin1_output);
+
+// Finds the previous leading byte, scanning backward from buf, and validates
+// with errors from there. Used to pinpoint the location of an error when an
+// invalid chunk is detected. We assume that the stream starts with a leading
+// byte; to check that this is the case, we ask that you pass a pointer to the
+// start of the stream (start).
+inline simdutf_warn_unused result rewind_and_validate_with_errors(
+    const char *start, const char *buf, size_t len) noexcept {
+  // First check that we start with a leading byte
+  if ((*start & 0b11000000) == 0b10000000) {
+    return result(error_code::TOO_LONG, 0);
+  }
+  size_t extra_len{0};
+  // A leading byte cannot be further than 4 bytes away
+  for (int i = 0; i < 5; i++) {
+    unsigned char byte = *buf;
+    if ((byte & 0b11000000) != 0b10000000) {
+      break;
+    } else {
+      buf--; // NOTE(review): bounded by the counter only, never compared with start
+      extra_len++;
+    }
+  }
+
+  result res = validate_with_errors(buf, len + extra_len);
+  res.count -= extra_len; // make the count relative to the buf that was passed in
+  return res;
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
-    const char *buf, size_t len, char *latin1_output) noexcept {
-  return get_default_implementation()->convert_valid_utf8_to_latin1(
-      buf, len, latin1_output);
+
+inline size_t count_code_points(const char *buf, size_t len) {
+  const int8_t *p = reinterpret_cast<const int8_t *>(buf);
+  size_t counter{0};
+  for (size_t i = 0; i < len; i++) {
+    // -65 is 0b10111111; any larger value in two's complement is not a
+    // continuation byte and so starts a new code point.
+    if (p[i] > -65) {
+      counter++;
+    }
+  }
+  return counter;
 }
-simdutf_warn_unused size_t convert_utf8_to_utf16le(
-    const char *input, size_t length, char16_t *utf16_output) noexcept {
-  return get_default_implementation()->convert_utf8_to_utf16le(input, length,
-                                                               utf16_output);
+
+inline size_t utf16_length_from_utf8(const char *buf, size_t len) {
+  const int8_t *p = reinterpret_cast<const int8_t *>(buf);
+  size_t counter{0};
+  for (size_t i = 0; i < len; i++) {
+    if (p[i] > -65) { // not a continuation byte (0x80..0xBF)
+      counter++;
+    }
+    if (uint8_t(p[i]) >= 240) { // byte >= 0xF0 (4-byte lead) is counted twice
+      counter++;
+    }
+  }
+  return counter;
 }
-simdutf_warn_unused size_t convert_utf8_to_utf16be(
-    const char *input, size_t length, char16_t *utf16_output) noexcept {
-  return get_default_implementation()->convert_utf8_to_utf16be(input, length,
-                                                               utf16_output);
+
+simdutf_warn_unused inline size_t trim_partial_utf8(const char *input,
+                                                    size_t length) {
+  if (length < 3) {
+    switch (length) {
+    case 2:
+      if (uint8_t(input[length - 1]) >= 0xc0) {
+        return length - 1;
+      } // 2-, 3- and 4-byte characters with only 1 byte left
+      if (uint8_t(input[length - 2]) >= 0xe0) {
+        return length - 2;
+      } // 3- and 4-byte characters with only 2 bytes left
+      return length;
+    case 1:
+      if (uint8_t(input[length - 1]) >= 0xc0) {
+        return length - 1;
+      } // 2-, 3- and 4-byte characters with only 1 byte left
+      return length;
+    case 0:
+      return length;
+    }
+  }
+  if (uint8_t(input[length - 1]) >= 0xc0) {
+    return length - 1;
+  } // 2-, 3- and 4-byte characters with only 1 byte left
+  if (uint8_t(input[length - 2]) >= 0xe0) {
+    return length - 2;
+  } // 3- and 4-byte characters with only 2 bytes left
+  if (uint8_t(input[length - 3]) >= 0xf0) {
+    return length - 3;
+  } // 4-byte characters with only 3 bytes left
+  return length;
 }
-simdutf_warn_unused result convert_utf8_to_utf16_with_errors(
-    const char *input, size_t length, char16_t *utf16_output) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
-#else
-  return convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
+
+} // namespace utf8
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
 #endif
+/* end file src/scalar/utf8.h */
+/* begin file src/scalar/utf16.h */
+#ifndef SIMDUTF_UTF16_H
+#define SIMDUTF_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16 {
+
+inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
+  return uint16_t((word >> 8) | (word << 8));
 }
-simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
-    const char *input, size_t length, char16_t *utf16_output) noexcept {
-  return get_default_implementation()->convert_utf8_to_utf16le_with_errors(
-      input, length, utf16_output);
+
+template <endianness big_endian>
+inline simdutf_warn_unused bool validate(const char16_t *buf,
+                                         size_t len) noexcept {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  uint64_t pos = 0;
+  while (pos < len) {
+    uint16_t word =
+        !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
+    if ((word & 0xF800) == 0xD800) {
+      if (pos + 1 >= len) {
+        return false;
+      }
+      uint16_t diff = uint16_t(word - 0xD800);
+      if (diff > 0x3FF) {
+        return false;
+      }
+      uint16_t next_word =
+          !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if (diff2 > 0x3FF) {
+        return false;
+      }
+      pos += 2;
+    } else {
+      pos++;
+    }
+  }
+  return true;
 }
-simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
-    const char *input, size_t length, char16_t *utf16_output) noexcept {
-  return get_default_implementation()->convert_utf8_to_utf16be_with_errors(
-      input, length, utf16_output);
+
+template <endianness big_endian>
+inline simdutf_warn_unused result validate_with_errors(const char16_t *buf,
+                                                       size_t len) noexcept {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0;
+  while (pos < len) {
+    uint16_t word =
+        !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
+    if ((word & 0xF800) == 0xD800) {
+      if (pos + 1 >= len) {
+        return result(error_code::SURROGATE, pos);
+      }
+      uint16_t diff = uint16_t(word - 0xD800);
+      if (diff > 0x3FF) {
+        return result(error_code::SURROGATE, pos);
+      }
+      uint16_t next_word =
+          !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if (diff2 > 0x3FF) {
+        return result(error_code::SURROGATE, pos);
+      }
+      pos += 2;
+    } else {
+      pos++;
+    }
+  }
+  return result(error_code::SUCCESS, pos);
 }
-simdutf_warn_unused size_t convert_utf8_to_utf32(
-    const char *input, size_t length, char32_t *utf32_output) noexcept {
-  return get_default_implementation()->convert_utf8_to_utf32(input, length,
-                                                             utf32_output);
+
+template <endianness big_endian>
+inline size_t count_code_points(const char16_t *buf, size_t len) {
+  // We are not BOM aware.
+  const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
+  size_t counter{0};
+  for (size_t i = 0; i < len; i++) {
+    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+    counter += ((word & 0xFC00) != 0xDC00);
+  }
+  return counter;
 }
-simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
-    const char *input, size_t length, char32_t *utf32_output) noexcept {
-  return get_default_implementation()->convert_utf8_to_utf32_with_errors(
-      input, length, utf32_output);
+
+template <endianness big_endian>
+inline size_t utf8_length_from_utf16(const char16_t *buf, size_t len) {
+  // We are not BOM aware.
+  const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
+  size_t counter{0};
+  for (size_t i = 0; i < len; i++) {
+    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+    counter++; // ASCII
+    counter += static_cast<size_t>(
+        word >
+        0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes
+    counter += static_cast<size_t>((word > 0x7FF && word <= 0xD7FF) ||
+                                   (word >= 0xE000)); // three-byte
+  }
+  return counter;
 }
-simdutf_warn_unused bool validate_utf16(const char16_t *buf,
-                                        size_t len) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return validate_utf16be(buf, len);
-#else
-  return validate_utf16le(buf, len);
-#endif
+
+template <endianness big_endian>
+inline size_t utf32_length_from_utf16(const char16_t *buf, size_t len) {
+  // We are not BOM aware.
+  const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
+  size_t counter{0};
+  for (size_t i = 0; i < len; i++) {
+    uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
+    counter += ((word & 0xFC00) != 0xDC00);
+  }
+  return counter;
 }
-simdutf_warn_unused bool validate_utf16le(const char16_t *buf,
-                                          size_t len) noexcept {
-  return get_default_implementation()->validate_utf16le(buf, len);
+
+inline size_t latin1_length_from_utf16(size_t len) { return len; }
+
+simdutf_really_inline void change_endianness_utf16(const char16_t *in,
+                                                   size_t size, char16_t *out) {
+  const uint16_t *input = reinterpret_cast<const uint16_t *>(in);
+  uint16_t *output = reinterpret_cast<uint16_t *>(out);
+  for (size_t i = 0; i < size; i++) {
+    *output++ = uint16_t(input[i] >> 8 | input[i] << 8);
+  }
 }
-simdutf_warn_unused bool validate_utf16be(const char16_t *buf,
-                                          size_t len) noexcept {
-  return get_default_implementation()->validate_utf16be(buf, len);
+
+template <endianness big_endian>
+simdutf_warn_unused inline size_t trim_partial_utf16(const char16_t *input,
+                                                     size_t length) {
+  if (length <= 1) {
+    return length;
+  }
+  uint16_t last_word = uint16_t(input[length - 1]);
+  last_word = !match_system(big_endian) ? swap_bytes(last_word) : last_word;
+  length -= ((last_word & 0xFC00) == 0xD800);
+  return length;
 }
-simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf,
-                                                      size_t len) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return validate_utf16be_with_errors(buf, len);
-#else
-  return validate_utf16le_with_errors(buf, len);
+
+} // namespace utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
 #endif
+/* end file src/scalar/utf16.h */
+/* begin file src/scalar/utf32.h */
+#ifndef SIMDUTF_UTF32_H
+#define SIMDUTF_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32 {
+
+inline simdutf_warn_unused bool validate(const char32_t *buf,
+                                         size_t len) noexcept {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  uint64_t pos = 0;
+  for (; pos < len; pos++) {
+    uint32_t word = data[pos];
+    if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
+      return false;
+    }
+  }
+  return true;
 }
-simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf,
-                                                        size_t len) noexcept {
-  return get_default_implementation()->validate_utf16le_with_errors(buf, len);
-}
-simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf,
-                                                        size_t len) noexcept {
-  return get_default_implementation()->validate_utf16be_with_errors(buf, len);
-}
-simdutf_warn_unused bool validate_utf32(const char32_t *buf,
-                                        size_t len) noexcept {
-  return get_default_implementation()->validate_utf32(buf, len);
-}
-simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf,
-                                                      size_t len) noexcept {
-  return get_default_implementation()->validate_utf32_with_errors(buf, len);
+
+inline simdutf_warn_unused result validate_with_errors(const char32_t *buf,
+                                                       size_t len) noexcept {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0;
+  for (; pos < len; pos++) {
+    uint32_t word = data[pos];
+    if (word > 0x10FFFF) {
+      return result(error_code::TOO_LARGE, pos);
+    }
+    if (word >= 0xD800 && word <= 0xDFFF) {
+      return result(error_code::SURROGATE, pos);
+    }
+  }
+  return result(error_code::SUCCESS, pos);
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16(
-    const char *input, size_t length, char16_t *utf16_buffer) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
-#else
-  return convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
-#endif
+
+inline size_t utf8_length_from_utf32(const char32_t *buf, size_t len) {
+  // We are not BOM aware.
+  const uint32_t *p = reinterpret_cast<const uint32_t *>(buf);
+  size_t counter{0};
+  for (size_t i = 0; i < len; i++) {
+    // credit: @ttsugriy  for the vectorizable approach
+    counter++;                                     // ASCII
+    counter += static_cast<size_t>(p[i] > 0x7F);   // two-byte
+    counter += static_cast<size_t>(p[i] > 0x7FF);  // three-byte
+    counter += static_cast<size_t>(p[i] > 0xFFFF); // four-bytes
+  }
+  return counter;
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
-    const char *input, size_t length, char16_t *utf16_buffer) noexcept {
-  return get_default_implementation()->convert_valid_utf8_to_utf16le(
-      input, length, utf16_buffer);
+
+inline size_t utf16_length_from_utf32(const char32_t *buf, size_t len) {
+  // We are not BOM aware.
+  const uint32_t *p = reinterpret_cast<const uint32_t *>(buf);
+  size_t counter{0};
+  for (size_t i = 0; i < len; i++) {
+    counter++;                                     // non-surrogate word
+    counter += static_cast<size_t>(p[i] > 0xFFFF); // surrogate pair
+  }
+  return counter;
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
-    const char *input, size_t length, char16_t *utf16_buffer) noexcept {
-  return get_default_implementation()->convert_valid_utf8_to_utf16be(
-      input, length, utf16_buffer);
+
+inline size_t latin1_length_from_utf32(size_t len) {
+  // We are not BOM aware.
+  return len; // a utf32 codepoint will always represent 1 latin1 character
 }
-simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
-    const char *input, size_t length, char32_t *utf32_buffer) noexcept {
-  return get_default_implementation()->convert_valid_utf8_to_utf32(
-      input, length, utf32_buffer);
+
+inline simdutf_warn_unused uint32_t swap_bytes(const uint32_t word) {
+  return ((word >> 24) & 0xff) |      // move byte 3 to byte 0
+         ((word << 8) & 0xff0000) |   // move byte 1 to byte 2
+         ((word >> 8) & 0xff00) |     // move byte 2 to byte 1
+         ((word << 24) & 0xff000000); // byte 0 to byte 3
 }
-simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *buf,
-                                                 size_t len,
-                                                 char *utf8_buffer) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_utf8(buf, len, utf8_buffer);
-#else
-  return convert_utf16le_to_utf8(buf, len, utf8_buffer);
+
+} // namespace utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
 #endif
+/* end file src/scalar/utf32.h */
+/* begin file src/scalar/base64.h */
+#ifndef SIMDUTF_BASE64_H
+#define SIMDUTF_BASE64_H
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace base64 {
+
+// This function is not expected to be fast. Do not use in long loops.
+template <class char_type> bool is_ascii_white_space(char_type c) {
+  return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f';
 }
-simdutf_warn_unused size_t convert_utf16_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_latin1(buf, len, latin1_buffer);
-#else
-  return convert_utf16le_to_latin1(buf, len, latin1_buffer);
-#endif
+
+template <class char_type> bool is_ascii_white_space_or_padding(char_type c) {
+  return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' ||
+         c == '=';
 }
-simdutf_warn_unused size_t convert_latin1_to_utf16(
-    const char *buf, size_t len, char16_t *utf16_output) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_latin1_to_utf16be(buf, len, utf16_output);
-#else
-  return convert_latin1_to_utf16le(buf, len, utf16_output);
-#endif
+
+template <class char_type> bool is_eight_byte(char_type c) {
+  if (sizeof(char_type) == 1) {
+    return true;
+  }
+  return uint8_t(c) == c;
 }
-simdutf_warn_unused size_t convert_utf16be_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
-  return get_default_implementation()->convert_utf16be_to_latin1(buf, len,
-                                                                 latin1_buffer);
+
+// Returns a full_result. The destination buffer must be large enough.
+// This function assumes that the padding (=) has been removed.
+template <class char_type>
+full_result
+base64_tail_decode(char *dst, const char_type *src, size_t length,
+                   size_t padded_characters, // number of padding characters
+                                             // '=', typically 0, 1, 2.
+                   base64_options options,
+                   last_chunk_handling_options last_chunk_options) {
+  // This looks like 5 branches, but we expect the compiler to resolve this to a
+  // single branch:
+  const uint8_t *to_base64 = (options & base64_url)
+                                 ? tables::base64::to_base64_url_value
+                                 : tables::base64::to_base64_value;
+  const uint32_t *d0 = (options & base64_url)
+                           ? tables::base64::base64_url::d0
+                           : tables::base64::base64_default::d0;
+  const uint32_t *d1 = (options & base64_url)
+                           ? tables::base64::base64_url::d1
+                           : tables::base64::base64_default::d1;
+  const uint32_t *d2 = (options & base64_url)
+                           ? tables::base64::base64_url::d2
+                           : tables::base64::base64_default::d2;
+  const uint32_t *d3 = (options & base64_url)
+                           ? tables::base64::base64_url::d3
+                           : tables::base64::base64_default::d3;
+
+  const char_type *srcend = src + length;
+  const char_type *srcinit = src;
+  const char *dstinit = dst;
+
+  uint32_t x;
+  size_t idx;
+  uint8_t buffer[4];
+  while (true) {
+    while (src + 4 <= srcend && is_eight_byte(src[0]) &&
+           is_eight_byte(src[1]) && is_eight_byte(src[2]) &&
+           is_eight_byte(src[3]) &&
+           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
+                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
+      if (match_system(endianness::BIG)) {
+        x = scalar::utf32::swap_bytes(x);
+      }
+      std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes
+      dst += 3;
+      src += 4;
+    }
+    idx = 0;
+    // we need at least four characters.
+    while (idx < 4 && src < srcend) {
+      char_type c = *src;
+      uint8_t code = to_base64[uint8_t(c)];
+      buffer[idx] = uint8_t(code);
+      if (is_eight_byte(c) && code <= 63) {
+        idx++;
+      } else if (code > 64 || !scalar::base64::is_eight_byte(c)) {
+        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+                size_t(dst - dstinit)};
+      } else {
+        // We have a space or a newline. We ignore it.
+      }
+      src++;
+    }
+    if (idx != 4) {
+      if (last_chunk_options == last_chunk_handling_options::strict &&
+          (idx != 1) && ((idx + padded_characters) & 3) != 0) {
+        // The partial chunk was at src - idx
+        return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
+                size_t(dst - dstinit)};
+      } else if (last_chunk_options ==
+                     last_chunk_handling_options::stop_before_partial &&
+                 (idx != 1) && ((idx + padded_characters) & 3) != 0) {
+        // Rewind src to before partial chunk
+        src -= idx;
+        return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
+      } else {
+        if (idx == 2) {
+          uint32_t triple =
+              (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6);
+          if ((last_chunk_options == last_chunk_handling_options::strict) &&
+              (triple & 0xffff)) {
+            return {BASE64_EXTRA_BITS, size_t(src - srcinit),
+                    size_t(dst - dstinit)};
+          }
+          if (match_system(endianness::BIG)) {
+            triple <<= 8;
+            std::memcpy(dst, &triple, 1);
+          } else {
+            triple = scalar::utf32::swap_bytes(triple);
+            triple >>= 8;
+            std::memcpy(dst, &triple, 1);
+          }
+          dst += 1;
+        } else if (idx == 3) {
+          uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) +
+                            (uint32_t(buffer[1]) << 2 * 6) +
+                            (uint32_t(buffer[2]) << 1 * 6);
+          if ((last_chunk_options == last_chunk_handling_options::strict) &&
+              (triple & 0xff)) {
+            return {BASE64_EXTRA_BITS, size_t(src - srcinit),
+                    size_t(dst - dstinit)};
+          }
+          if (match_system(endianness::BIG)) {
+            triple <<= 8;
+            std::memcpy(dst, &triple, 2);
+          } else {
+            triple = scalar::utf32::swap_bytes(triple);
+            triple >>= 8;
+            std::memcpy(dst, &triple, 2);
+          }
+          dst += 2;
+        } else if (idx == 1) {
+          return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
+                  size_t(dst - dstinit)};
+        }
+        return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
+      }
+    }
+
+    uint32_t triple =
+        (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) +
+        (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6);
+    if (match_system(endianness::BIG)) {
+      triple <<= 8;
+      std::memcpy(dst, &triple, 3);
+    } else {
+      triple = scalar::utf32::swap_bytes(triple);
+      triple >>= 8;
+      std::memcpy(dst, &triple, 3);
+    }
+    dst += 3;
+  }
 }
-simdutf_warn_unused size_t convert_utf16le_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
-  return get_default_implementation()->convert_utf16le_to_latin1(buf, len,
-                                                                 latin1_buffer);
-}
-simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
-  return get_default_implementation()->convert_valid_utf16be_to_latin1(
-      buf, len, latin1_buffer);
-}
-simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
-  return get_default_implementation()->convert_valid_utf16le_to_latin1(
-      buf, len, latin1_buffer);
-}
-simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
-    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
-  return get_default_implementation()->convert_utf16le_to_latin1_with_errors(
-      buf, len, latin1_buffer);
+
+// like base64_tail_decode, but it will not write past the end of the output
+// buffer. The outlen parameter is modified to reflect the number of bytes
+// written. This function assumes that the padding (=) has been removed.
+template <class char_type>
+result base64_tail_decode_safe(
+    char *dst, size_t &outlen, const char_type *&srcr, size_t length,
+    size_t padded_characters, // number of padding characters '=', typically 0,
+                              // 1, 2.
+    base64_options options, last_chunk_handling_options last_chunk_options) {
+  const char_type *src = srcr;
+  if (length == 0) {
+    outlen = 0;
+    return {SUCCESS, 0};
+  }
+  // This looks like 5 branches, but we expect the compiler to resolve this to a
+  // single branch:
+  const uint8_t *to_base64 = (options & base64_url)
+                                 ? tables::base64::to_base64_url_value
+                                 : tables::base64::to_base64_value;
+  const uint32_t *d0 = (options & base64_url)
+                           ? tables::base64::base64_url::d0
+                           : tables::base64::base64_default::d0;
+  const uint32_t *d1 = (options & base64_url)
+                           ? tables::base64::base64_url::d1
+                           : tables::base64::base64_default::d1;
+  const uint32_t *d2 = (options & base64_url)
+                           ? tables::base64::base64_url::d2
+                           : tables::base64::base64_default::d2;
+  const uint32_t *d3 = (options & base64_url)
+                           ? tables::base64::base64_url::d3
+                           : tables::base64::base64_default::d3;
+
+  const char_type *srcend = src + length;
+  const char_type *srcinit = src;
+  const char *dstinit = dst;
+  const char *dstend = dst + outlen;
+
+  uint32_t x;
+  size_t idx;
+  uint8_t buffer[4];
+  while (true) {
+    while (src + 4 <= srcend && is_eight_byte(src[0]) &&
+           is_eight_byte(src[1]) && is_eight_byte(src[2]) &&
+           is_eight_byte(src[3]) &&
+           (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
+                d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
+      if (dstend - dst < 3) {
+        outlen = size_t(dst - dstinit);
+        srcr = src;
+        return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)};
+      }
+      if (match_system(endianness::BIG)) {
+        x = scalar::utf32::swap_bytes(x);
+      }
+      std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes
+      dst += 3;
+      src += 4;
+    }
+    idx = 0;
+    const char_type *srccur = src;
+    // We need at least four characters.
+    while (idx < 4 && src < srcend) {
+      char_type c = *src;
+      uint8_t code = to_base64[uint8_t(c)];
+
+      buffer[idx] = uint8_t(code);
+      if (is_eight_byte(c) && code <= 63) {
+        idx++;
+      } else if (code > 64 || !scalar::base64::is_eight_byte(c)) {
+        outlen = size_t(dst - dstinit);
+        srcr = src;
+        return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
+      } else {
+        // We have a space or a newline. We ignore it.
+      }
+      src++;
+    }
+    if (idx != 4) {
+      if (last_chunk_options == last_chunk_handling_options::strict &&
+          ((idx + padded_characters) & 3) != 0) {
+        outlen = size_t(dst - dstinit);
+        srcr = src;
+        return {BASE64_INPUT_REMAINDER, size_t(src - srcinit)};
+      } else if (last_chunk_options ==
+                     last_chunk_handling_options::stop_before_partial &&
+                 ((idx + padded_characters) & 3) != 0) {
+        // Rewind src to before partial chunk
+        srcr = srccur;
+        outlen = size_t(dst - dstinit);
+        return {SUCCESS, size_t(dst - dstinit)};
+      } else { // loose mode
+        if (idx == 0) {
+          // No data left; return success
+          outlen = size_t(dst - dstinit);
+          srcr = src;
+          return {SUCCESS, size_t(dst - dstinit)};
+        } else if (idx == 1) {
+          // Error: Incomplete chunk of length 1 is invalid in loose mode
+          outlen = size_t(dst - dstinit);
+          srcr = src;
+          return {BASE64_INPUT_REMAINDER, size_t(src - srcinit)};
+        } else if (idx == 2 || idx == 3) {
+          // Check if there's enough space in the destination buffer
+          size_t required_space = (idx == 2) ? 1 : 2;
+          if (size_t(dstend - dst) < required_space) {
+            outlen = size_t(dst - dstinit);
+            srcr = src;
+            return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
+          }
+          uint32_t triple = 0;
+          if (idx == 2) {
+            triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12);
+            if ((last_chunk_options == last_chunk_handling_options::strict) &&
+                (triple & 0xffff)) {
+              srcr = src;
+              return {BASE64_EXTRA_BITS, size_t(src - srcinit)};
+            }
+            // Extract the first byte
+            triple >>= 16;
+            dst[0] = static_cast<char>(triple & 0xFF);
+            dst += 1;
+          } else if (idx == 3) {
+            triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12) +
+                     (uint32_t(buffer[2]) << 6);
+            if ((last_chunk_options == last_chunk_handling_options::strict) &&
+                (triple & 0xff)) {
+              srcr = src;
+              return {BASE64_EXTRA_BITS, size_t(src - srcinit)};
+            }
+            // Extract the first two bytes
+            triple >>= 8;
+            dst[0] = static_cast<char>((triple >> 8) & 0xFF);
+            dst[1] = static_cast<char>(triple & 0xFF);
+            dst += 2;
+          }
+          outlen = size_t(dst - dstinit);
+          srcr = src;
+          return {SUCCESS, size_t(dst - dstinit)};
+        }
+      }
+    }
+
+    if (dstend - dst < 3) {
+      outlen = size_t(dst - dstinit);
+      srcr = src;
+      return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
+    }
+    uint32_t triple = (uint32_t(buffer[0]) << 18) +
+                      (uint32_t(buffer[1]) << 12) + (uint32_t(buffer[2]) << 6) +
+                      (uint32_t(buffer[3]));
+    if (match_system(endianness::BIG)) {
+      triple <<= 8;
+      std::memcpy(dst, &triple, 3);
+    } else {
+      triple = scalar::utf32::swap_bytes(triple);
+      triple >>= 8;
+      std::memcpy(dst, &triple, 3);
+    }
+    dst += 3;
+  }
 }
-simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
-    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
-  return get_default_implementation()->convert_utf16be_to_latin1_with_errors(
-      buf, len, latin1_buffer);
+
+// Returns the number of bytes written. The destination buffer must be large
+// enough. It will add padding (=) if needed.
+size_t tail_encode_base64(char *dst, const char *src, size_t srclen,
+                          base64_options options) {
+  // By default, we use padding if we are not using the URL variant.
+  // This is checked with ((options & base64_url) == 0) which returns true if we
+  // are not using the URL variant. However, we also allow 'inversion' of the
+  // convention with the base64_reverse_padding option. If the
+  // base64_reverse_padding option is set, we use padding if we are using the
+  // URL variant, and we omit it if we are not using the URL variant. This is
+  // checked with
+  // ((options & base64_reverse_padding) == base64_reverse_padding).
+  bool use_padding =
+      ((options & base64_url) == 0) ^
+      ((options & base64_reverse_padding) == base64_reverse_padding);
+  // This looks like 3 branches, but we expect the compiler to resolve this to
+  // a single branch:
+  const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0
+                                          : tables::base64::base64_default::e0;
+  const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1
+                                          : tables::base64::base64_default::e1;
+  const char *e2 = (options & base64_url) ? tables::base64::base64_url::e2
+                                          : tables::base64::base64_default::e2;
+  char *out = dst;
+  size_t i = 0;
+  uint8_t t1, t2, t3;
+  for (; i + 2 < srclen; i += 3) {
+    t1 = uint8_t(src[i]);
+    t2 = uint8_t(src[i + 1]);
+    t3 = uint8_t(src[i + 2]);
+    *out++ = e0[t1];
+    *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+    *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)];
+    *out++ = e2[t3];
+  }
+  switch (srclen - i) {
+  case 0:
+    break;
+  case 1:
+    t1 = uint8_t(src[i]);
+    *out++ = e0[t1];
+    *out++ = e1[(t1 & 0x03) << 4];
+    if (use_padding) {
+      *out++ = '=';
+      *out++ = '=';
+    }
+    break;
+  default: /* case 2 */
+    t1 = uint8_t(src[i]);
+    t2 = uint8_t(src[i + 1]);
+    *out++ = e0[t1];
+    *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)];
+    *out++ = e2[(t2 & 0x0F) << 2];
+    if (use_padding) {
+      *out++ = '=';
+    }
+  }
+  return (size_t)(out - dst);
 }
-simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *buf,
-                                                   size_t len,
-                                                   char *utf8_buffer) noexcept {
-  return get_default_implementation()->convert_utf16le_to_utf8(buf, len,
-                                                               utf8_buffer);
+
+template <class char_type>
+simdutf_warn_unused size_t maximal_binary_length_from_base64(
+    const char_type *input, size_t length) noexcept {
+  // We follow https://infra.spec.whatwg.org/#forgiving-base64-decode
+  size_t padding = 0;
+  if (length > 0) {
+    if (input[length - 1] == '=') {
+      padding++;
+      if (length > 1 && input[length - 2] == '=') {
+        padding++;
+      }
+    }
+  }
+  size_t actual_length = length - padding;
+  if (actual_length % 4 <= 1) {
+    return actual_length / 4 * 3;
+  }
+  // if we have a valid input, then the remainder must be 2 or 3 adding one or
+  // two extra bytes.
+  return actual_length / 4 * 3 + (actual_length % 4) - 1;
 }
-simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *buf,
-                                                   size_t len,
-                                                   char *utf8_buffer) noexcept {
-  return get_default_implementation()->convert_utf16be_to_utf8(buf, len,
-                                                               utf8_buffer);
+
+simdutf_warn_unused size_t
+base64_length_from_binary(size_t length, base64_options options) noexcept {
+  // By default, we use padding if we are not using the URL variant.
+  // This is checked with ((options & base64_url) == 0) which returns true if we
+  // are not using the URL variant. However, we also allow 'inversion' of the
+  // convention with the base64_reverse_padding option. If the
+  // base64_reverse_padding option is set, we use padding if we are using the
+  // URL variant, and we omit it if we are not using the URL variant. This is
+  // checked with
+  // ((options & base64_reverse_padding) == base64_reverse_padding).
+  bool use_padding =
+      ((options & base64_url) == 0) ^
+      ((options & base64_reverse_padding) == base64_reverse_padding);
+  if (!use_padding) {
+    return length / 3 * 4 + ((length % 3) ? (length % 3) + 1 : 0);
+  }
+  return (length + 2) / 3 *
+         4; // We use padding to make the length a multiple of 4.
 }
-simdutf_warn_unused result convert_utf16_to_utf8_with_errors(
-    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
-#else
-  return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
+
+} // namespace base64
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
 #endif
+/* end file src/scalar/base64.h */
+/* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
+#ifndef SIMDUTF_LATIN1_TO_UTF8_H
+#define SIMDUTF_LATIN1_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1_to_utf8 {
+
+inline size_t convert(const char *buf, size_t len, char *utf8_output) {
+  const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
+  size_t pos = 0;
+  size_t utf8_pos = 0;
+  while (pos < len) {
+    // try to convert the next block of 16 ASCII bytes
+    if (pos + 16 <=
+        len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 |
+                 v2}; // We are only interested in these bits: 1000 1000 1000
+                      // 1000, so it makes sense to OR everything together
+      if ((v & 0x8080808080808080) ==
+          0) { // if NONE of these are set, e.g. all of them are zero, then
+               // everything is ASCII
+        size_t final_pos = pos + 16;
+        while (pos < final_pos) {
+          utf8_output[utf8_pos++] = char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+
+    unsigned char byte = data[pos];
+    if ((byte & 0x80) == 0) { // if ASCII
+      // will generate one UTF-8 byte
+      utf8_output[utf8_pos++] = char(byte);
+      pos++;
+    } else {
+      // will generate two UTF-8 bytes
+      utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000);
+      utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000);
+      pos++;
+    }
+  }
+  return utf8_pos;
 }
-simdutf_warn_unused result convert_utf16_to_latin1_with_errors(
-    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer);
-#else
-  return convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer);
+
+inline size_t convert_safe(const char *buf, size_t len, char *utf8_output,
+                           size_t utf8_len) {
+  const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
+  size_t pos = 0;
+  size_t skip_pos = 0;
+  size_t utf8_pos = 0;
+  while (pos < len && utf8_pos < utf8_len) {
+    // try to convert the next block of 16 ASCII bytes
+    if (pos >= skip_pos && pos + 16 <= len &&
+        utf8_pos + 16 <= utf8_len) { // if it is safe to read 16 more bytes,
+                                     // check that they are ascii
+      uint64_t v1;
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 |
+                 v2}; // We are only interested in these bits: 1000 1000 1000
+                      // 1000, so it makes sense to OR everything together
+      if ((v & 0x8080808080808080) ==
+          0) { // if NONE of these are set, e.g. all of them are zero, then
+               // everything is ASCII
+        ::memcpy(utf8_output + utf8_pos, buf + pos, 16);
+        utf8_pos += 16;
+        pos += 16;
+      } else {
+        // At least one of the next 16 bytes is not ASCII; we will process them
+        // one by one
+        skip_pos = pos + 16;
+      }
+    } else {
+      const auto byte = data[pos];
+      if ((byte & 0x80) == 0) { // if ASCII
+        // will generate one UTF-8 byte
+        utf8_output[utf8_pos++] = char(byte);
+        pos++;
+      } else if (utf8_pos + 2 <= utf8_len) {
+        // will generate two UTF-8 bytes
+        utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000);
+        utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000);
+        pos++;
+      } else {
+        break;
+      }
+    }
+  }
+  return utf8_pos;
+}
+
+} // namespace latin1_to_utf8
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
 #endif
+/* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */
+
+namespace simdutf {
+bool implementation::supported_by_runtime_system() const {
+  uint32_t required_instruction_sets = this->required_instruction_sets();
+  uint32_t supported_instruction_sets =
+      internal::detect_supported_architectures();
+  return ((supported_instruction_sets & required_instruction_sets) ==
+          required_instruction_sets);
 }
-simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
-    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
-  return get_default_implementation()->convert_utf16le_to_utf8_with_errors(
-      buf, len, utf8_buffer);
+
+simdutf_warn_unused encoding_type implementation::autodetect_encoding(
+    const char *input, size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  if (bom_encoding != encoding_type::unspecified) {
+    return bom_encoding;
+  }
+  // UTF8 is common, it includes ASCII, and is commonly represented
+  // without a BOM, so if it fits, go with that. Note that it is still
+  // possible to get it wrong, we are only 'guessing'. If someone has UTF-16
+  // data without a BOM, it could pass as UTF-8.
+  //
+  // An interesting twist might be to check for UTF-16 ASCII first (every
+  // other byte is zero).
+  if (validate_utf8(input, length)) {
+    return encoding_type::UTF8;
+  }
+  // The next most common encoding that might appear without BOM is probably
+  // UTF-16LE, so try that next.
+  if ((length % 2) == 0) {
+    // important: we need to divide by two
+    if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
+                         length / 2)) {
+      return encoding_type::UTF16_LE;
+    }
+  }
+  if ((length % 4) == 0) {
+    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+      return encoding_type::UTF32_LE;
+    }
+  }
+  return encoding_type::unspecified;
 }
-simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
-    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
-  return get_default_implementation()->convert_utf16be_to_utf8_with_errors(
-      buf, len, utf8_buffer);
+
+namespace internal {
+// When there is a single implementation, we should not pay a price
+// for dispatching to the best implementation. We should just use the
+// one we have. This is a compile-time check.
+#define SIMDUTF_SINGLE_IMPLEMENTATION                                          \
+  (SIMDUTF_IMPLEMENTATION_ICELAKE + SIMDUTF_IMPLEMENTATION_HASWELL +           \
+       SIMDUTF_IMPLEMENTATION_WESTMERE + SIMDUTF_IMPLEMENTATION_ARM64 +        \
+       SIMDUTF_IMPLEMENTATION_PPC64 + SIMDUTF_IMPLEMENTATION_LSX +             \
+       SIMDUTF_IMPLEMENTATION_LASX + SIMDUTF_IMPLEMENTATION_FALLBACK ==        \
+   1)
+
+// Static array of known implementations. We are hoping these get baked into the
+// executable without requiring a static initializer.
+
+#if SIMDUTF_IMPLEMENTATION_ICELAKE
+static const icelake::implementation *get_icelake_singleton() {
+  static const icelake::implementation icelake_singleton{};
+  return &icelake_singleton;
 }
-simdutf_warn_unused size_t convert_valid_utf16_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
-#else
-  return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
 #endif
+#if SIMDUTF_IMPLEMENTATION_HASWELL
+static const haswell::implementation *get_haswell_singleton() {
+  static const haswell::implementation haswell_singleton{};
+  return &haswell_singleton;
 }
-simdutf_warn_unused size_t convert_valid_utf16_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_valid_utf16be_to_latin1(buf, len, latin1_buffer);
-#else
-  return convert_valid_utf16le_to_latin1(buf, len, latin1_buffer);
 #endif
+#if SIMDUTF_IMPLEMENTATION_WESTMERE
+static const westmere::implementation *get_westmere_singleton() {
+  static const westmere::implementation westmere_singleton{};
+  return &westmere_singleton;
 }
-simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
-  return get_default_implementation()->convert_valid_utf16le_to_utf8(
-      buf, len, utf8_buffer);
+#endif
+#if SIMDUTF_IMPLEMENTATION_ARM64
+static const arm64::implementation *get_arm64_singleton() {
+  static const arm64::implementation arm64_singleton{};
+  return &arm64_singleton;
 }
-simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
-  return get_default_implementation()->convert_valid_utf16be_to_utf8(
-      buf, len, utf8_buffer);
-}
-simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *buf,
-                                                 size_t len,
-                                                 char *utf8_buffer) noexcept {
-  return get_default_implementation()->convert_utf32_to_utf8(buf, len,
-                                                             utf8_buffer);
-}
-simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
-    const char32_t *buf, size_t len, char *utf8_buffer) noexcept {
-  return get_default_implementation()->convert_utf32_to_utf8_with_errors(
-      buf, len, utf8_buffer);
-}
-simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
-    const char32_t *buf, size_t len, char *utf8_buffer) noexcept {
-  return get_default_implementation()->convert_valid_utf32_to_utf8(buf, len,
-                                                                   utf8_buffer);
-}
-simdutf_warn_unused size_t convert_utf32_to_utf16(
-    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf32_to_utf16be(buf, len, utf16_buffer);
-#else
-  return convert_utf32_to_utf16le(buf, len, utf16_buffer);
-#endif
-}
-simdutf_warn_unused size_t convert_utf32_to_latin1(
-    const char32_t *input, size_t length, char *latin1_output) noexcept {
-  return get_default_implementation()->convert_utf32_to_latin1(input, length,
-                                                               latin1_output);
-}
-simdutf_warn_unused size_t convert_utf32_to_utf16le(
-    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
-  return get_default_implementation()->convert_utf32_to_utf16le(buf, len,
-                                                                utf16_buffer);
-}
-simdutf_warn_unused size_t convert_utf32_to_utf16be(
-    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
-  return get_default_implementation()->convert_utf32_to_utf16be(buf, len,
-                                                                utf16_buffer);
-}
-simdutf_warn_unused result convert_utf32_to_utf16_with_errors(
-    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
-#else
-  return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
-#endif
-}
-simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
-    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
-  return get_default_implementation()->convert_utf32_to_utf16le_with_errors(
-      buf, len, utf16_buffer);
-}
-simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
-    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
-  return get_default_implementation()->convert_utf32_to_utf16be_with_errors(
-      buf, len, utf16_buffer);
-}
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16(
-    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
-#else
-  return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
 #endif
+#if SIMDUTF_IMPLEMENTATION_PPC64
+static const ppc64::implementation *get_ppc64_singleton() {
+  static const ppc64::implementation ppc64_singleton{};
+  return &ppc64_singleton;
 }
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
-    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
-  return get_default_implementation()->convert_valid_utf32_to_utf16le(
-      buf, len, utf16_buffer);
-}
-simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
-    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
-  return get_default_implementation()->convert_valid_utf32_to_utf16be(
-      buf, len, utf16_buffer);
-}
-simdutf_warn_unused size_t convert_utf16_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_utf32(buf, len, utf32_buffer);
-#else
-  return convert_utf16le_to_utf32(buf, len, utf32_buffer);
 #endif
+#if SIMDUTF_IMPLEMENTATION_RVV
+static const rvv::implementation *get_rvv_singleton() {
+  static const rvv::implementation rvv_singleton{};
+  return &rvv_singleton;
 }
-simdutf_warn_unused size_t convert_utf16le_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
-  return get_default_implementation()->convert_utf16le_to_utf32(buf, len,
-                                                                utf32_buffer);
-}
-simdutf_warn_unused size_t convert_utf16be_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
-  return get_default_implementation()->convert_utf16be_to_utf32(buf, len,
-                                                                utf32_buffer);
-}
-simdutf_warn_unused result convert_utf16_to_utf32_with_errors(
-    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
-#else
-  return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
 #endif
+#if SIMDUTF_IMPLEMENTATION_LSX
+static const lsx::implementation *get_lsx_singleton() {
+  static const lsx::implementation lsx_singleton{};
+  return &lsx_singleton;
 }
-simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
-    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
-  return get_default_implementation()->convert_utf16le_to_utf32_with_errors(
-      buf, len, utf32_buffer);
-}
-simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
-    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
-  return get_default_implementation()->convert_utf16be_to_utf32_with_errors(
-      buf, len, utf32_buffer);
-}
-simdutf_warn_unused size_t convert_valid_utf16_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
-#else
-  return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
 #endif
+#if SIMDUTF_IMPLEMENTATION_LASX
+static const lasx::implementation *get_lasx_singleton() {
+  static const lasx::implementation lasx_singleton{};
+  return &lasx_singleton;
 }
-simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
-  return get_default_implementation()->convert_valid_utf16le_to_utf32(
-      buf, len, utf32_buffer);
-}
-simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
-  return get_default_implementation()->convert_valid_utf16be_to_utf32(
-      buf, len, utf32_buffer);
-}
-void change_endianness_utf16(const char16_t *input, size_t length,
-                             char16_t *output) noexcept {
-  get_default_implementation()->change_endianness_utf16(input, length, output);
-}
-simdutf_warn_unused size_t count_utf16(const char16_t *input,
-                                       size_t length) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return count_utf16be(input, length);
-#else
-  return count_utf16le(input, length);
 #endif
+#if SIMDUTF_IMPLEMENTATION_FALLBACK
+static const fallback::implementation *get_fallback_singleton() {
+  static const fallback::implementation fallback_singleton{};
+  return &fallback_singleton;
 }
-simdutf_warn_unused size_t count_utf16le(const char16_t *input,
-                                         size_t length) noexcept {
-  return get_default_implementation()->count_utf16le(input, length);
-}
-simdutf_warn_unused size_t count_utf16be(const char16_t *input,
-                                         size_t length) noexcept {
-  return get_default_implementation()->count_utf16be(input, length);
-}
-simdutf_warn_unused size_t count_utf8(const char *input,
-                                      size_t length) noexcept {
-  return get_default_implementation()->count_utf8(input, length);
-}
-simdutf_warn_unused size_t latin1_length_from_utf8(const char *buf,
-                                                   size_t len) noexcept {
-  return get_default_implementation()->latin1_length_from_utf8(buf, len);
-}
-simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) noexcept {
-  return get_default_implementation()->latin1_length_from_utf16(len);
-}
-simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) noexcept {
-  return get_default_implementation()->latin1_length_from_utf32(len);
-}
-simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf,
-                                                   size_t len) noexcept {
-  return get_default_implementation()->utf8_length_from_latin1(buf, len);
-}
-simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input,
-                                                  size_t length) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return utf8_length_from_utf16be(input, length);
-#else
-  return utf8_length_from_utf16le(input, length);
 #endif
+
+#if SIMDUTF_SINGLE_IMPLEMENTATION
+static const implementation *get_single_implementation() {
+  return
+  #if SIMDUTF_IMPLEMENTATION_ICELAKE
+      get_icelake_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_HASWELL
+  get_haswell_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_WESTMERE
+  get_westmere_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_ARM64
+  get_arm64_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_PPC64
+  get_ppc64_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_LSX
+  get_lsx_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_LASX
+  get_lasx_singleton();
+  #endif
+  #if SIMDUTF_IMPLEMENTATION_FALLBACK
+  get_fallback_singleton();
+  #endif
 }
-simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input,
-                                                    size_t length) noexcept {
-  return get_default_implementation()->utf8_length_from_utf16le(input, length);
-}
-simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input,
-                                                    size_t length) noexcept {
-  return get_default_implementation()->utf8_length_from_utf16be(input, length);
-}
-simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input,
-                                                   size_t length) noexcept {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return utf32_length_from_utf16be(input, length);
-#else
-  return utf32_length_from_utf16le(input, length);
 #endif
-}
-simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input,
-                                                     size_t length) noexcept {
-  return get_default_implementation()->utf32_length_from_utf16le(input, length);
-}
-simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input,
-                                                     size_t length) noexcept {
-  return get_default_implementation()->utf32_length_from_utf16be(input, length);
-}
-simdutf_warn_unused size_t utf16_length_from_utf8(const char *input,
-                                                  size_t length) noexcept {
-  return get_default_implementation()->utf16_length_from_utf8(input, length);
-}
-simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept {
-  return get_default_implementation()->utf16_length_from_latin1(length);
-}
-simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input,
-                                                  size_t length) noexcept {
-  return get_default_implementation()->utf8_length_from_utf32(input, length);
-}
-simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input,
-                                                   size_t length) noexcept {
-  return get_default_implementation()->utf16_length_from_utf32(input, length);
-}
-simdutf_warn_unused size_t utf32_length_from_utf8(const char *input,
-                                                  size_t length) noexcept {
-  return get_default_implementation()->utf32_length_from_utf8(input, length);
-}
 
-simdutf_warn_unused size_t
-maximal_binary_length_from_base64(const char *input, size_t length) noexcept {
-  return get_default_implementation()->maximal_binary_length_from_base64(
-      input, length);
-}
+/**
+ * @private Detects best supported implementation on first use, and sets it
+ */
+class detect_best_supported_implementation_on_first_use final
+    : public implementation {
+public:
+  std::string name() const noexcept final { return set_best()->name(); }
+  std::string description() const noexcept final {
+    return set_best()->description();
+  }
+  uint32_t required_instruction_sets() const noexcept final {
+    return set_best()->required_instruction_sets();
+  }
 
-simdutf_warn_unused result base64_to_binary(
-    const char *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_handling_options) noexcept {
-  return get_default_implementation()->base64_to_binary(
-      input, length, output, options, last_chunk_handling_options);
-}
+  simdutf_warn_unused int
+  detect_encodings(const char *input, size_t length) const noexcept override {
+    return set_best()->detect_encodings(input, length);
+  }
 
-simdutf_warn_unused size_t maximal_binary_length_from_base64(
-    const char16_t *input, size_t length) noexcept {
-  return get_default_implementation()->maximal_binary_length_from_base64(
-      input, length);
-}
+  simdutf_warn_unused bool
+  validate_utf8(const char *buf, size_t len) const noexcept final override {
+    return set_best()->validate_utf8(buf, len);
+  }
 
-simdutf_warn_unused result base64_to_binary(
-    const char16_t *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_handling_options) noexcept {
-  return get_default_implementation()->base64_to_binary(
-      input, length, output, options, last_chunk_handling_options);
-}
+  simdutf_warn_unused result validate_utf8_with_errors(
+      const char *buf, size_t len) const noexcept final override {
+    return set_best()->validate_utf8_with_errors(buf, len);
+  }
 
-template <typename chartype>
-simdutf_warn_unused result base64_to_binary_safe_impl(
-    const chartype *input, size_t length, char *output, size_t &outlen,
-    base64_options options,
-    last_chunk_handling_options last_chunk_handling_options) noexcept {
-  static_assert(std::is_same<chartype, char>::value ||
-                    std::is_same<chartype, char16_t>::value,
-                "Only char and char16_t are supported.");
-  // The implementation could be nicer, but we expect that most times, the user
-  // will provide us with a buffer that is large enough.
-  size_t max_length = maximal_binary_length_from_base64(input, length);
-  if (outlen >= max_length) {
-    // fast path
-    full_result r = get_default_implementation()->base64_to_binary_details(
-        input, length, output, options, last_chunk_handling_options);
-    if (r.error != error_code::INVALID_BASE64_CHARACTER &&
-        r.error != error_code::BASE64_EXTRA_BITS) {
-      outlen = r.output_count;
-      if (last_chunk_handling_options == stop_before_partial) {
-        if ((r.output_count % 3) != 0) {
-          bool empty_trail = true;
-          for (size_t i = r.input_count; i < length; i++) {
-            if (!scalar::base64::is_ascii_white_space_or_padding(input[i])) {
-              empty_trail = false;
-              break;
-            }
-          }
-          if (empty_trail) {
-            r.input_count = length;
-          }
-        }
-        return {r.error, r.input_count};
-      }
-      return {r.error, length};
-    }
-    return r;
+  simdutf_warn_unused bool
+  validate_ascii(const char *buf, size_t len) const noexcept final override {
+    return set_best()->validate_ascii(buf, len);
   }
-  // The output buffer is maybe too small. We will decode a truncated version of
-  // the input.
-  size_t outlen3 = outlen / 3 * 3; // round down to multiple of 3
-  size_t safe_input = base64_length_from_binary(outlen3, options);
-  full_result r = get_default_implementation()->base64_to_binary_details(
-      input, safe_input, output, options, loose);
-  if (r.error == error_code::INVALID_BASE64_CHARACTER) {
-    return r;
+
+  simdutf_warn_unused result validate_ascii_with_errors(
+      const char *buf, size_t len) const noexcept final override {
+    return set_best()->validate_ascii_with_errors(buf, len);
   }
-  size_t offset =
-      (r.error == error_code::BASE64_INPUT_REMAINDER)
-          ? 1
-          : ((r.output_count % 3) == 0 ? 0 : (r.output_count % 3) + 1);
-  size_t output_index = r.output_count - (r.output_count % 3);
-  size_t input_index = safe_input;
-  // offset is a value that is no larger than 3. We backtrack
-  // by up to offset characters + an undetermined number of
-  // white space characters. It is expected that the next loop
-  // runs at most 3 times + the number of white space characters
-  // in between them, so we are not worried about performance.
-  while (offset > 0 && input_index > 0) {
-    chartype c = input[--input_index];
-    if (scalar::base64::is_ascii_white_space(c)) {
-      // skipping
-    } else {
-      offset--;
-    }
+
+  simdutf_warn_unused bool
+  validate_utf16le(const char16_t *buf,
+                   size_t len) const noexcept final override {
+    return set_best()->validate_utf16le(buf, len);
   }
-  size_t remaining_out = outlen - output_index;
-  const chartype *tail_input = input + input_index;
-  size_t tail_length = length - input_index;
-  while (tail_length > 0 &&
-         scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) {
-    tail_length--;
+
+  simdutf_warn_unused bool
+  validate_utf16be(const char16_t *buf,
+                   size_t len) const noexcept final override {
+    return set_best()->validate_utf16be(buf, len);
   }
-  size_t padding_characts = 0;
-  if (tail_length > 0 && tail_input[tail_length - 1] == '=') {
-    tail_length--;
-    padding_characts++;
-    while (tail_length > 0 &&
-           scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) {
-      tail_length--;
-    }
-    if (tail_length > 0 && tail_input[tail_length - 1] == '=') {
-      tail_length--;
-      padding_characts++;
-    }
+
+  simdutf_warn_unused result validate_utf16le_with_errors(
+      const char16_t *buf, size_t len) const noexcept final override {
+    return set_best()->validate_utf16le_with_errors(buf, len);
   }
-  // this will advance tail_input and tail_length
-  result rr = scalar::base64::base64_tail_decode_safe(
-      output + output_index, remaining_out, tail_input, tail_length,
-      padding_characts, options, last_chunk_handling_options);
-  outlen = output_index + remaining_out;
-  if (last_chunk_handling_options != stop_before_partial &&
-      rr.error == error_code::SUCCESS && padding_characts > 0) {
-    // additional checks
-    if ((outlen % 3 == 0) || ((outlen % 3) + 1 + padding_characts != 4)) {
-      rr.error = error_code::INVALID_BASE64_CHARACTER;
-    }
+
+  simdutf_warn_unused result validate_utf16be_with_errors(
+      const char16_t *buf, size_t len) const noexcept final override {
+    return set_best()->validate_utf16be_with_errors(buf, len);
   }
-  if (rr.error == error_code::SUCCESS &&
-      last_chunk_handling_options == stop_before_partial) {
-    if (tail_input > input + input_index) {
-      rr.count = tail_input - input;
-    } else if (r.input_count > 0) {
-      rr.count = r.input_count + rr.count;
-    }
-    return rr;
+
+  simdutf_warn_unused bool
+  validate_utf32(const char32_t *buf,
+                 size_t len) const noexcept final override {
+    return set_best()->validate_utf32(buf, len);
   }
-  rr.count += input_index;
-  return rr;
-}
 
-simdutf_warn_unused size_t convert_latin1_to_utf8_safe(
-    const char *buf, size_t len, char *utf8_output, size_t utf8_len) noexcept {
-  const auto start{utf8_output};
+  simdutf_warn_unused result validate_utf32_with_errors(
+      const char32_t *buf, size_t len) const noexcept final override {
+    return set_best()->validate_utf32_with_errors(buf, len);
+  }
 
-  while (true) {
-    // convert_latin1_to_utf8 will never write more than input length * 2
-    auto read_len = std::min(len, utf8_len >> 1);
-    if (read_len <= 16) {
-      break;
-    }
+  simdutf_warn_unused size_t
+  convert_latin1_to_utf8(const char *buf, size_t len,
+                         char *utf8_output) const noexcept final override {
+    return set_best()->convert_latin1_to_utf8(buf, len, utf8_output);
+  }
 
-    const auto write_len =
-        simdutf::convert_latin1_to_utf8(buf, read_len, utf8_output);
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(
+      const char *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output);
+  }
 
-    utf8_output += write_len;
-    utf8_len -= write_len;
-    buf += read_len;
-    len -= read_len;
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(
+      const char *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output);
   }
 
-  utf8_output +=
-      scalar::latin1_to_utf8::convert_safe(buf, len, utf8_output, utf8_len);
+  simdutf_warn_unused size_t convert_latin1_to_utf32(
+      const char *buf, size_t len,
+      char32_t *latin1_output) const noexcept final override {
+    return set_best()->convert_latin1_to_utf32(buf, len, latin1_output);
+  }
 
-  return utf8_output - start;
-}
+  simdutf_warn_unused size_t
+  convert_utf8_to_latin1(const char *buf, size_t len,
+                         char *latin1_output) const noexcept final override {
+    return set_best()->convert_utf8_to_latin1(buf, len, latin1_output);
+  }
 
-simdutf_warn_unused result base64_to_binary_safe(
-    const char *input, size_t length, char *output, size_t &outlen,
-    base64_options options,
-    last_chunk_handling_options last_chunk_handling_options) noexcept {
-  return base64_to_binary_safe_impl<char>(input, length, output, outlen,
-                                          options, last_chunk_handling_options);
-}
-simdutf_warn_unused result base64_to_binary_safe(
-    const char16_t *input, size_t length, char *output, size_t &outlen,
-    base64_options options,
-    last_chunk_handling_options last_chunk_handling_options) noexcept {
-  return base64_to_binary_safe_impl<char16_t>(
-      input, length, output, outlen, options, last_chunk_handling_options);
-}
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+      const char *buf, size_t len,
+      char *latin1_output) const noexcept final override {
+    return set_best()->convert_utf8_to_latin1_with_errors(buf, len,
+                                                          latin1_output);
+  }
 
-simdutf_warn_unused size_t
-base64_length_from_binary(size_t length, base64_options options) noexcept {
-  return get_default_implementation()->base64_length_from_binary(length,
-                                                                 options);
-}
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+      const char *buf, size_t len,
+      char *latin1_output) const noexcept final override {
+    return set_best()->convert_valid_utf8_to_latin1(buf, len, latin1_output);
+  }
 
-size_t binary_to_base64(const char *input, size_t length, char *output,
-                        base64_options options) noexcept {
-  return get_default_implementation()->binary_to_base64(input, length, output,
-                                                        options);
-}
+  simdutf_warn_unused size_t convert_utf8_to_utf16le(
+      const char *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output);
+  }
 
-simdutf_warn_unused simdutf::encoding_type
-autodetect_encoding(const char *buf, size_t length) noexcept {
-  return get_default_implementation()->autodetect_encoding(buf, length);
-}
-simdutf_warn_unused int detect_encodings(const char *buf,
-                                         size_t length) noexcept {
-  return get_default_implementation()->detect_encodings(buf, length);
-}
-const implementation *builtin_implementation() {
-  static const implementation *builtin_impl =
-      get_available_implementations()[SIMDUTF_STRINGIFY(
-          SIMDUTF_BUILTIN_IMPLEMENTATION)];
-  return builtin_impl;
-}
+  simdutf_warn_unused size_t convert_utf8_to_utf16be(
+      const char *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output);
+  }
 
-simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length) {
-  return scalar::utf8::trim_partial_utf8(input, length);
-}
+  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+      const char *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_utf8_to_utf16le_with_errors(buf, len,
+                                                           utf16_output);
+  }
 
-simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
-                                                size_t length) {
-  return scalar::utf16::trim_partial_utf16<BIG>(input, length);
-}
+  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+      const char *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_utf8_to_utf16be_with_errors(buf, len,
+                                                           utf16_output);
+  }
 
-simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
-                                                size_t length) {
-  return scalar::utf16::trim_partial_utf16<LITTLE>(input, length);
-}
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+      const char *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output);
+  }
 
-simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
-                                              size_t length) {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return trim_partial_utf16be(input, length);
-#else
-  return trim_partial_utf16le(input, length);
-#endif
-}
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+      const char *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output);
+  }
 
-} // namespace simdutf
-/* end file src/implementation.cpp */
-/* begin file src/encoding_types.cpp */
+  simdutf_warn_unused size_t
+  convert_utf8_to_utf32(const char *buf, size_t len,
+                        char32_t *utf32_output) const noexcept final override {
+    return set_best()->convert_utf8_to_utf32(buf, len, utf32_output);
+  }
 
-namespace simdutf {
-bool match_system(endianness e) {
-#if SIMDUTF_IS_BIG_ENDIAN
-  return e == endianness::BIG;
-#else
-  return e == endianness::LITTLE;
-#endif
-}
+  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+      const char *buf, size_t len,
+      char32_t *utf32_output) const noexcept final override {
+    return set_best()->convert_utf8_to_utf32_with_errors(buf, len,
+                                                         utf32_output);
+  }
 
-std::string to_string(encoding_type bom) {
-  switch (bom) {
-  case UTF16_LE:
-    return "UTF16 little-endian";
-  case UTF16_BE:
-    return "UTF16 big-endian";
-  case UTF32_LE:
-    return "UTF32 little-endian";
-  case UTF32_BE:
-    return "UTF32 big-endian";
-  case UTF8:
-    return "UTF8";
-  case unspecified:
-    return "unknown";
-  default:
-    return "error";
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+      const char *buf, size_t len,
+      char32_t *utf32_output) const noexcept final override {
+    return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output);
   }
-}
 
-namespace BOM {
-// Note that BOM for UTF8 is discouraged.
-encoding_type check_bom(const uint8_t *byte, size_t length) {
-  if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) {
-    if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) {
-      return encoding_type::UTF32_LE;
-    } else {
-      return encoding_type::UTF16_LE;
-    }
-  } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) {
-    return encoding_type::UTF16_BE;
-  } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and
-             byte[2] == 0xfe and byte[3] == 0xff) {
-    return encoding_type::UTF32_BE;
-  } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and
-             byte[2] == 0xbf) {
-    return encoding_type::UTF8;
+  simdutf_warn_unused size_t
+  convert_utf16le_to_latin1(const char16_t *buf, size_t len,
+                            char *latin1_output) const noexcept final override {
+    return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output);
   }
-  return encoding_type::unspecified;
-}
 
-encoding_type check_bom(const char *byte, size_t length) {
-  return check_bom(reinterpret_cast<const uint8_t *>(byte), length);
-}
+  simdutf_warn_unused size_t
+  convert_utf16be_to_latin1(const char16_t *buf, size_t len,
+                            char *latin1_output) const noexcept final override {
+    return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output);
+  }
 
-size_t bom_byte_size(encoding_type bom) {
-  switch (bom) {
-  case UTF16_LE:
-    return 2;
-  case UTF16_BE:
-    return 2;
-  case UTF32_LE:
-    return 4;
-  case UTF32_BE:
-    return 4;
-  case UTF8:
-    return 3;
-  case unspecified:
-    return 0;
-  default:
-    return 0;
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+      const char16_t *buf, size_t len,
+      char *latin1_output) const noexcept final override {
+    return set_best()->convert_utf16le_to_latin1_with_errors(buf, len,
+                                                             latin1_output);
   }
-}
 
-} // namespace BOM
-} // namespace simdutf
-/* end file src/encoding_types.cpp */
-/* begin file src/error.cpp */
-namespace simdutf {
-// deliberately empty
-}
-/* end file src/error.cpp */
-// The large tables should be included once and they
-// should not depend on a kernel.
-/* begin file src/tables/utf8_to_utf16_tables.h */
-#ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H
-#define SIMDUTF_UTF8_TO_UTF16_TABLES_H
-#include <cstdint>
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+      const char16_t *buf, size_t len,
+      char *latin1_output) const noexcept final override {
+    return set_best()->convert_utf16be_to_latin1_with_errors(buf, len,
+                                                             latin1_output);
+  }
 
-namespace simdutf {
-namespace {
-namespace tables {
-namespace utf8_to_utf16 {
-/**
- * utf8bigindex uses about 8 kB
- * shufutf8 uses about 3344 B
- *
- * So we use a bit over 11 kB. It would be
- * easy to save about 4 kB by only
- * storing the index in utf8bigindex, and
- * deriving the consumed bytes otherwise.
- * However, this may come at a significant (10% to 20%)
- * performance penalty.
- */
+  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
+      const char16_t *buf, size_t len,
+      char *latin1_output) const noexcept final override {
+    return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output);
+  }
 
-const uint8_t shufutf8[209][16] = {
-    {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0},
-    {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0},
-    {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0},
-    {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0},
-    {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0},
-    {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0},
-    {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0},
-    {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0},
-    {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
-    {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
-    {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
-    {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
-    {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
-    {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
-    {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
-    {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
-    {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
-    {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
-    {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
-    {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
-    {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
-    {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
-    {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
-    {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
-    {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
-    {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
-    {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
-    {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
-    {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
-    {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
-    {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
-    {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
-    {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
-    {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
-    {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
-    {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
-    {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
-    {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
-    {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
-    {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
-    {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
-    {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
-    {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
-    {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
-    {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
-    {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
-    {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
-    {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
-    {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
-    {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
-    {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
-    {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
-    {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
-    {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
-    {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
-    {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
-    {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0},
-    {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0},
-    {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0},
-    {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0},
-    {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0},
-    {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0},
-    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0},
-    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0},
-    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255},
-    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255},
-    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255},
-    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255},
-    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255},
-    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255},
-    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255},
-    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255},
-    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255},
-    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
-    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
-    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
-    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
-    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
-    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
-    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
-    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
-    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
-    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255},
-    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255},
-    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255},
-    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255},
-    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255},
-    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255},
-    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255},
-    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255},
-    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255},
-    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
-    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
-    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
-    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
-    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
-    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
-    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
-    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
-    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
-    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
-    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
-    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
-    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
-    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
-    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
-    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
-    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
-    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
-    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255},
-    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255},
-    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255},
-    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255},
-    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255},
-    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255},
-    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255},
-    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255},
-    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255},
-    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
-    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
-    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
-    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
-    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
-    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
-    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
-    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
-    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
-    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255},
-    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255},
-    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255},
-    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255},
-    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255},
-    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255},
-    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255},
-    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255},
-    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255},
-    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255},
-    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255},
-    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255},
-    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255},
-    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255},
-    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255},
-    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255},
-    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255},
-    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255},
-    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0},
-    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0},
-    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0},
-    {0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0},
-    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
-    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
-    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
-    {0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
-    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0},
-    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0},
-    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0},
-    {0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0},
-    {0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0},
-    {0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0},
-    {0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0},
-    {0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0},
-    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
-    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
-    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
-    {1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
-    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
-    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
-    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
-    {1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
-    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0},
-    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0},
-    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0},
-    {1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0},
-    {1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0},
-    {1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0},
-    {1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0},
-    {1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0},
-    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
-    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
-    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
-    {2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
-    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
-    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
-    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
-    {2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
-    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0},
-    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0},
-    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0},
-    {2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0},
-    {2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0},
-    {2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0},
-    {2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0},
-    {2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0},
-    {3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
-    {3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
-    {3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
-    {3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
-    {3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0},
-    {3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0},
-    {3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0},
-    {3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0},
-    {3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0},
-    {3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0},
-    {3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0},
-    {3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0},
-    {3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0},
-    {3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0},
-    {3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0},
-    {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}};
-/* number of two bytes : 64 */
-/* number of two + three bytes : 145 */
-/* number of two + three + four bytes : 209 */
-const uint8_t utf8bigindex[4096][2] = {
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},
-    {161, 4},  {64, 4},   {209, 12}, {209, 12}, {209, 12}, {147, 5},  {209, 12},
-    {150, 5},  {162, 5},  {65, 5},   {209, 12}, {153, 5},  {165, 5},  {67, 5},
-    {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {209, 12},
-    {148, 6},  {209, 12}, {151, 6},  {163, 6},  {66, 6},   {209, 12}, {154, 6},
-    {166, 6},  {68, 6},   {178, 6},  {74, 6},   {92, 6},   {64, 4},   {209, 12},
-    {157, 6},  {169, 6},  {70, 6},   {181, 6},  {76, 6},   {94, 6},   {65, 5},
-    {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},
-    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {152, 7},
-    {164, 7},  {145, 3},  {209, 12}, {155, 7},  {167, 7},  {69, 7},   {179, 7},
-    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {170, 7},  {71, 7},
-    {182, 7},  {77, 7},   {95, 7},   {65, 5},   {194, 7},  {83, 7},   {101, 7},
-    {67, 5},   {119, 7},  {73, 5},   {91, 5},   {1, 7},    {209, 12}, {209, 12},
-    {173, 7},  {148, 6},  {185, 7},  {79, 7},   {97, 7},   {66, 6},   {197, 7},
-    {85, 7},   {103, 7},  {68, 6},   {121, 7},  {74, 6},   {92, 6},   {2, 7},
-    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {76, 6},   {94, 6},
-    {4, 7},    {193, 6},  {82, 6},   {100, 6},  {8, 7},    {118, 6},  {16, 7},
-    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {145, 3},  {209, 12}, {156, 8},  {168, 8},  {146, 4},
-    {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {171, 8},
-    {72, 8},   {183, 8},  {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},
-    {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
-    {209, 12}, {174, 8},  {148, 6},  {186, 8},  {80, 8},   {98, 8},   {66, 6},
-    {198, 8},  {86, 8},   {104, 8},  {68, 6},   {122, 8},  {74, 6},   {92, 6},
-    {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {76, 6},
-    {94, 6},   {5, 8},    {193, 6},  {82, 6},   {100, 6},  {9, 8},    {118, 6},
-    {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},
-    {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
-    {112, 8},  {71, 7},   {130, 8},  {77, 7},   {95, 7},   {6, 8},    {194, 7},
-    {83, 7},   {101, 7},  {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},
-    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},
-    {66, 6},   {197, 7},  {85, 7},   {103, 7},  {12, 8},   {121, 7},  {20, 8},
-    {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
-    {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},
-    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12},
-    {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12},
-    {160, 9},  {172, 9},  {147, 5},  {184, 9},  {150, 5},  {162, 5},  {65, 5},
-    {196, 9},  {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},
-    {64, 4},   {209, 12}, {209, 12}, {175, 9},  {148, 6},  {187, 9},  {81, 9},
-    {99, 9},   {66, 6},   {199, 9},  {87, 9},   {105, 9},  {68, 6},   {123, 9},
-    {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {111, 9},  {70, 6},
-    {129, 9},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},
-    {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {190, 9},  {152, 7},  {164, 7},  {145, 3},  {202, 9},
-    {89, 9},   {107, 9},  {69, 7},   {125, 9},  {75, 7},   {93, 7},   {64, 4},
-    {209, 12}, {158, 7},  {113, 9},  {71, 7},   {131, 9},  {77, 7},   {95, 7},
-    {7, 9},    {194, 7},  {83, 7},   {101, 7},  {11, 9},   {119, 7},  {19, 9},
-    {35, 9},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {137, 9},
-    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},  {13, 9},
-    {121, 7},  {21, 9},   {37, 9},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
-    {70, 6},   {127, 7},  {25, 9},   {41, 9},   {4, 7},    {193, 6},  {82, 6},
-    {49, 9},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
-    {205, 9},  {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},
-    {64, 4},   {209, 12}, {159, 8},  {115, 9},  {72, 8},   {133, 9},  {78, 8},
-    {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},
-    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},
-    {139, 9},  {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {104, 8},
-    {14, 9},   {122, 8},  {22, 9},   {38, 9},   {3, 8},    {209, 12}, {157, 6},
-    {110, 8},  {70, 6},   {128, 8},  {26, 9},   {42, 9},   {5, 8},    {193, 6},
-    {82, 6},   {50, 9},   {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},
-    {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},
-    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},
-    {28, 9},   {44, 9},   {6, 8},    {194, 7},  {83, 7},   {52, 9},   {10, 8},
-    {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
-    {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
-    {56, 9},   {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12},
-    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},
-    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},
-    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12},
-    {149, 4},  {161, 4},  {64, 4},   {209, 12}, {209, 12}, {209, 12}, {147, 5},
-    {209, 12}, {150, 5},  {162, 5},  {65, 5},   {209, 12}, {153, 5},  {165, 5},
-    {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12},
-    {176, 10}, {148, 6},  {188, 10}, {151, 6},  {163, 6},  {66, 6},   {200, 10},
-    {154, 6},  {166, 6},  {68, 6},   {178, 6},  {74, 6},   {92, 6},   {64, 4},
-    {209, 12}, {157, 6},  {169, 6},  {70, 6},   {181, 6},  {76, 6},   {94, 6},
-    {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},
-    {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10},
-    {152, 7},  {164, 7},  {145, 3},  {203, 10}, {90, 10},  {108, 10}, {69, 7},
-    {126, 10}, {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {114, 10},
-    {71, 7},   {132, 10}, {77, 7},   {95, 7},   {65, 5},   {194, 7},  {83, 7},
-    {101, 7},  {67, 5},   {119, 7},  {73, 5},   {91, 5},   {1, 7},    {209, 12},
-    {209, 12}, {173, 7},  {148, 6},  {138, 10}, {79, 7},   {97, 7},   {66, 6},
-    {197, 7},  {85, 7},   {103, 7},  {68, 6},   {121, 7},  {74, 6},   {92, 6},
-    {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {76, 6},
-    {94, 6},   {4, 7},    {193, 6},  {82, 6},   {100, 6},  {8, 7},    {118, 6},
-    {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {145, 3},  {206, 10}, {156, 8},  {168, 8},
-    {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},
-    {116, 10}, {72, 8},   {134, 10}, {78, 8},   {96, 8},   {65, 5},   {195, 8},
-    {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},
-    {209, 12}, {209, 12}, {174, 8},  {148, 6},  {140, 10}, {80, 8},   {98, 8},
-    {66, 6},   {198, 8},  {86, 8},   {104, 8},  {15, 10},  {122, 8},  {23, 10},
-    {39, 10},  {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},
-    {27, 10},  {43, 10},  {5, 8},    {193, 6},  {82, 6},   {51, 10},  {9, 8},
-    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},
-    {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12},
-    {158, 7},  {112, 8},  {71, 7},   {130, 8},  {29, 10},  {45, 10},  {6, 8},
-    {194, 7},  {83, 7},   {53, 10},  {10, 8},   {119, 7},  {18, 8},   {34, 8},
-    {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},
-    {97, 7},   {66, 6},   {197, 7},  {85, 7},   {57, 10},  {12, 8},   {121, 7},
-    {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},
-    {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},
-    {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12},
-    {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},
-    {209, 12}, {160, 9},  {172, 9},  {147, 5},  {184, 9},  {150, 5},  {162, 5},
-    {65, 5},   {196, 9},  {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},
-    {91, 5},   {64, 4},   {209, 12}, {209, 12}, {175, 9},  {148, 6},  {142, 10},
-    {81, 9},   {99, 9},   {66, 6},   {199, 9},  {87, 9},   {105, 9},  {68, 6},
-    {123, 9},  {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {111, 9},
-    {70, 6},   {129, 9},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},
-    {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {190, 9},  {152, 7},  {164, 7},  {145, 3},
-    {202, 9},  {89, 9},   {107, 9},  {69, 7},   {125, 9},  {75, 7},   {93, 7},
-    {64, 4},   {209, 12}, {158, 7},  {113, 9},  {71, 7},   {131, 9},  {30, 10},
-    {46, 10},  {7, 9},    {194, 7},  {83, 7},   {54, 10},  {11, 9},   {119, 7},
-    {19, 9},   {35, 9},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},
-    {137, 9},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {58, 10},
-    {13, 9},   {121, 7},  {21, 9},   {37, 9},   {2, 7},    {209, 12}, {157, 6},
-    {109, 7},  {70, 6},   {127, 7},  {25, 9},   {41, 9},   {4, 7},    {193, 6},
-    {82, 6},   {49, 9},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {145, 3},  {205, 9},  {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},
-    {161, 4},  {64, 4},   {209, 12}, {159, 8},  {115, 9},  {72, 8},   {133, 9},
-    {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},
-    {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},
-    {148, 6},  {139, 9},  {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},
-    {60, 10},  {14, 9},   {122, 8},  {22, 9},   {38, 9},   {3, 8},    {209, 12},
-    {157, 6},  {110, 8},  {70, 6},   {128, 8},  {26, 9},   {42, 9},   {5, 8},
-    {193, 6},  {82, 6},   {50, 9},   {9, 8},    {118, 6},  {17, 8},   {33, 8},
-    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},
-    {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},
-    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},
-    {130, 8},  {28, 9},   {44, 9},   {6, 8},    {194, 7},  {83, 7},   {52, 9},
-    {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12},
-    {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},
-    {85, 7},   {56, 9},   {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},
-    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},
-    {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},
-    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},
-    {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12}, {209, 12}, {209, 12},
-    {147, 5},  {209, 12}, {150, 5},  {162, 5},  {65, 5},   {209, 12}, {153, 5},
-    {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
-    {209, 12}, {209, 12}, {148, 6},  {209, 12}, {151, 6},  {163, 6},  {66, 6},
-    {209, 12}, {154, 6},  {166, 6},  {68, 6},   {178, 6},  {74, 6},   {92, 6},
-    {64, 4},   {209, 12}, {157, 6},  {169, 6},  {70, 6},   {181, 6},  {76, 6},
-    {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},
-    {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {192, 11}, {152, 7},  {164, 7},  {145, 3},  {204, 11}, {155, 7},  {167, 7},
-    {69, 7},   {179, 7},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
-    {170, 7},  {71, 7},   {182, 7},  {77, 7},   {95, 7},   {65, 5},   {194, 7},
-    {83, 7},   {101, 7},  {67, 5},   {119, 7},  {73, 5},   {91, 5},   {1, 7},
-    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {185, 7},  {79, 7},   {97, 7},
-    {66, 6},   {197, 7},  {85, 7},   {103, 7},  {68, 6},   {121, 7},  {74, 6},
-    {92, 6},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
-    {76, 6},   {94, 6},   {4, 7},    {193, 6},  {82, 6},   {100, 6},  {8, 7},
-    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {207, 11}, {156, 8},
-    {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12},
-    {159, 8},  {117, 11}, {72, 8},   {135, 11}, {78, 8},   {96, 8},   {65, 5},
-    {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},
-    {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},  {141, 11}, {80, 8},
-    {98, 8},   {66, 6},   {198, 8},  {86, 8},   {104, 8},  {68, 6},   {122, 8},
-    {74, 6},   {92, 6},   {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},
-    {128, 8},  {76, 6},   {94, 6},   {5, 8},    {193, 6},  {82, 6},   {100, 6},
-    {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},
-    {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},
-    {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},  {77, 7},   {95, 7},
-    {6, 8},    {194, 7},  {83, 7},   {101, 7},  {10, 8},   {119, 7},  {18, 8},
-    {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},
-    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},  {12, 8},
-    {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
-    {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},
-    {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
-    {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},
-    {64, 4},   {209, 12}, {160, 9},  {172, 9},  {147, 5},  {184, 9},  {150, 5},
-    {162, 5},  {65, 5},   {196, 9},  {153, 5},  {165, 5},  {67, 5},   {177, 5},
-    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {175, 9},  {148, 6},
-    {143, 11}, {81, 9},   {99, 9},   {66, 6},   {199, 9},  {87, 9},   {105, 9},
-    {68, 6},   {123, 9},  {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},
-    {111, 9},  {70, 6},   {129, 9},  {76, 6},   {94, 6},   {65, 5},   {193, 6},
-    {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9},  {152, 7},  {164, 7},
-    {145, 3},  {202, 9},  {89, 9},   {107, 9},  {69, 7},   {125, 9},  {75, 7},
-    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {113, 9},  {71, 7},   {131, 9},
-    {31, 11},  {47, 11},  {7, 9},    {194, 7},  {83, 7},   {55, 11},  {11, 9},
-    {119, 7},  {19, 9},   {35, 9},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
-    {148, 6},  {137, 9},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
-    {59, 11},  {13, 9},   {121, 7},  {21, 9},   {37, 9},   {2, 7},    {209, 12},
-    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {25, 9},   {41, 9},   {4, 7},
-    {193, 6},  {82, 6},   {49, 9},   {8, 7},    {118, 6},  {16, 7},   {32, 7},
-    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {145, 3},  {205, 9},  {156, 8},  {168, 8},  {146, 4},  {180, 8},
-    {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {115, 9},  {72, 8},
-    {133, 9},  {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},
-    {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12},
-    {174, 8},  {148, 6},  {139, 9},  {80, 8},   {98, 8},   {66, 6},   {198, 8},
-    {86, 8},   {61, 11},  {14, 9},   {122, 8},  {22, 9},   {38, 9},   {3, 8},
-    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {26, 9},   {42, 9},
-    {5, 8},    {193, 6},  {82, 6},   {50, 9},   {9, 8},    {118, 6},  {17, 8},
-    {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},
-    {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},
-    {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},
-    {71, 7},   {130, 8},  {28, 9},   {44, 9},   {6, 8},    {194, 7},  {83, 7},
-    {52, 9},   {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12},
-    {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},
-    {197, 7},  {85, 7},   {56, 9},   {12, 8},   {121, 7},  {20, 8},   {36, 8},
-    {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},
-    {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},
-    {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12},
-    {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12}, {209, 12},
-    {209, 12}, {147, 5},  {209, 12}, {150, 5},  {162, 5},  {65, 5},   {209, 12},
-    {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},
-    {209, 12}, {209, 12}, {176, 10}, {148, 6},  {188, 10}, {151, 6},  {163, 6},
-    {66, 6},   {200, 10}, {154, 6},  {166, 6},  {68, 6},   {178, 6},  {74, 6},
-    {92, 6},   {64, 4},   {209, 12}, {157, 6},  {169, 6},  {70, 6},   {181, 6},
-    {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},
-    {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {191, 10}, {152, 7},  {164, 7},  {145, 3},  {203, 10}, {90, 10},
-    {108, 10}, {69, 7},   {126, 10}, {75, 7},   {93, 7},   {64, 4},   {209, 12},
-    {158, 7},  {114, 10}, {71, 7},   {132, 10}, {77, 7},   {95, 7},   {65, 5},
-    {194, 7},  {83, 7},   {101, 7},  {67, 5},   {119, 7},  {73, 5},   {91, 5},
-    {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {138, 10}, {79, 7},
-    {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},  {68, 6},   {121, 7},
-    {74, 6},   {92, 6},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},
-    {127, 7},  {76, 6},   {94, 6},   {4, 7},    {193, 6},  {82, 6},   {100, 6},
-    {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {206, 10},
-    {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},
-    {209, 12}, {159, 8},  {116, 10}, {72, 8},   {134, 10}, {78, 8},   {96, 8},
-    {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},
-    {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},  {140, 10},
-    {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {62, 11},  {15, 10},
-    {122, 8},  {23, 10},  {39, 10},  {3, 8},    {209, 12}, {157, 6},  {110, 8},
-    {70, 6},   {128, 8},  {27, 10},  {43, 10},  {5, 8},    {193, 6},  {82, 6},
-    {51, 10},  {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},
-    {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},
-    {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},  {29, 10},
-    {45, 10},  {6, 8},    {194, 7},  {83, 7},   {53, 10},  {10, 8},   {119, 7},
-    {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},
-    {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {57, 10},
-    {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},
-    {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},
-    {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},
-    {161, 4},  {64, 4},   {209, 12}, {160, 9},  {172, 9},  {147, 5},  {184, 9},
-    {150, 5},  {162, 5},  {65, 5},   {196, 9},  {153, 5},  {165, 5},  {67, 5},
-    {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {175, 9},
-    {148, 6},  {142, 10}, {81, 9},   {99, 9},   {66, 6},   {199, 9},  {87, 9},
-    {105, 9},  {68, 6},   {123, 9},  {74, 6},   {92, 6},   {64, 4},   {209, 12},
-    {157, 6},  {111, 9},  {70, 6},   {129, 9},  {76, 6},   {94, 6},   {65, 5},
-    {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},
-    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9},  {152, 7},
-    {164, 7},  {145, 3},  {202, 9},  {89, 9},   {107, 9},  {69, 7},   {125, 9},
-    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {113, 9},  {71, 7},
-    {131, 9},  {30, 10},  {46, 10},  {7, 9},    {194, 7},  {83, 7},   {54, 10},
-    {11, 9},   {119, 7},  {19, 9},   {35, 9},   {1, 7},    {209, 12}, {209, 12},
-    {173, 7},  {148, 6},  {137, 9},  {79, 7},   {97, 7},   {66, 6},   {197, 7},
-    {85, 7},   {58, 10},  {13, 9},   {121, 7},  {21, 9},   {37, 9},   {2, 7},
-    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {25, 9},   {41, 9},
-    {4, 7},    {193, 6},  {82, 6},   {49, 9},   {8, 7},    {118, 6},  {16, 7},
-    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {145, 3},  {205, 9},  {156, 8},  {168, 8},  {146, 4},
-    {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {115, 9},
-    {72, 8},   {133, 9},  {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},
-    {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
-    {209, 12}, {174, 8},  {148, 6},  {139, 9},  {80, 8},   {98, 8},   {66, 6},
-    {198, 8},  {86, 8},   {60, 10},  {14, 9},   {122, 8},  {22, 9},   {38, 9},
-    {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {26, 9},
-    {42, 9},   {5, 8},    {193, 6},  {82, 6},   {50, 9},   {9, 8},    {118, 6},
-    {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},
-    {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
-    {112, 8},  {71, 7},   {130, 8},  {28, 9},   {44, 9},   {6, 8},    {194, 7},
-    {83, 7},   {52, 9},   {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},
-    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},
-    {66, 6},   {197, 7},  {85, 7},   {56, 9},   {12, 8},   {121, 7},  {20, 8},
-    {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
-    {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},
-    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12},
-    {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12},
-    {209, 12}, {209, 12}, {147, 5},  {209, 12}, {150, 5},  {162, 5},  {65, 5},
-    {209, 12}, {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},
-    {64, 4},   {209, 12}, {209, 12}, {209, 12}, {148, 6},  {209, 12}, {151, 6},
-    {163, 6},  {66, 6},   {209, 12}, {154, 6},  {166, 6},  {68, 6},   {178, 6},
-    {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {169, 6},  {70, 6},
-    {181, 6},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},
-    {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {152, 7},  {164, 7},  {145, 3},  {209, 12},
-    {155, 7},  {167, 7},  {69, 7},   {179, 7},  {75, 7},   {93, 7},   {64, 4},
-    {209, 12}, {158, 7},  {170, 7},  {71, 7},   {182, 7},  {77, 7},   {95, 7},
-    {65, 5},   {194, 7},  {83, 7},   {101, 7},  {67, 5},   {119, 7},  {73, 5},
-    {91, 5},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {185, 7},
-    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},  {68, 6},
-    {121, 7},  {74, 6},   {92, 6},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
-    {70, 6},   {127, 7},  {76, 6},   {94, 6},   {4, 7},    {193, 6},  {82, 6},
-    {100, 6},  {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
-    {208, 12}, {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},
-    {64, 4},   {209, 12}, {159, 8},  {171, 8},  {72, 8},   {183, 8},  {78, 8},
-    {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},
-    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},
-    {186, 8},  {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {104, 8},
-    {68, 6},   {122, 8},  {74, 6},   {92, 6},   {3, 8},    {209, 12}, {157, 6},
-    {110, 8},  {70, 6},   {128, 8},  {76, 6},   {94, 6},   {5, 8},    {193, 6},
-    {82, 6},   {100, 6},  {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},
-    {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},
-    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},
-    {77, 7},   {95, 7},   {6, 8},    {194, 7},  {83, 7},   {101, 7},  {10, 8},
-    {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
-    {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
-    {103, 7},  {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12},
-    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},
-    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},
-    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12},
-    {149, 4},  {161, 4},  {64, 4},   {209, 12}, {160, 9},  {172, 9},  {147, 5},
-    {184, 9},  {150, 5},  {162, 5},  {65, 5},   {196, 9},  {153, 5},  {165, 5},
-    {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12},
-    {175, 9},  {148, 6},  {144, 12}, {81, 9},   {99, 9},   {66, 6},   {199, 9},
-    {87, 9},   {105, 9},  {68, 6},   {123, 9},  {74, 6},   {92, 6},   {64, 4},
-    {209, 12}, {157, 6},  {111, 9},  {70, 6},   {129, 9},  {76, 6},   {94, 6},
-    {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},
-    {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9},
-    {152, 7},  {164, 7},  {145, 3},  {202, 9},  {89, 9},   {107, 9},  {69, 7},
-    {125, 9},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {113, 9},
-    {71, 7},   {131, 9},  {77, 7},   {95, 7},   {7, 9},    {194, 7},  {83, 7},
-    {101, 7},  {11, 9},   {119, 7},  {19, 9},   {35, 9},   {1, 7},    {209, 12},
-    {209, 12}, {173, 7},  {148, 6},  {137, 9},  {79, 7},   {97, 7},   {66, 6},
-    {197, 7},  {85, 7},   {103, 7},  {13, 9},   {121, 7},  {21, 9},   {37, 9},
-    {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {25, 9},
-    {41, 9},   {4, 7},    {193, 6},  {82, 6},   {49, 9},   {8, 7},    {118, 6},
-    {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {145, 3},  {205, 9},  {156, 8},  {168, 8},
-    {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},
-    {115, 9},  {72, 8},   {133, 9},  {78, 8},   {96, 8},   {65, 5},   {195, 8},
-    {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},
-    {209, 12}, {209, 12}, {174, 8},  {148, 6},  {139, 9},  {80, 8},   {98, 8},
-    {66, 6},   {198, 8},  {86, 8},   {104, 8},  {14, 9},   {122, 8},  {22, 9},
-    {38, 9},   {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},
-    {26, 9},   {42, 9},   {5, 8},    {193, 6},  {82, 6},   {50, 9},   {9, 8},
-    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},
-    {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12},
-    {158, 7},  {112, 8},  {71, 7},   {130, 8},  {28, 9},   {44, 9},   {6, 8},
-    {194, 7},  {83, 7},   {52, 9},   {10, 8},   {119, 7},  {18, 8},   {34, 8},
-    {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},
-    {97, 7},   {66, 6},   {197, 7},  {85, 7},   {56, 9},   {12, 8},   {121, 7},
-    {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},
-    {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},
-    {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12},
-    {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},
-    {209, 12}, {209, 12}, {209, 12}, {147, 5},  {209, 12}, {150, 5},  {162, 5},
-    {65, 5},   {209, 12}, {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},
-    {91, 5},   {64, 4},   {209, 12}, {209, 12}, {176, 10}, {148, 6},  {188, 10},
-    {151, 6},  {163, 6},  {66, 6},   {200, 10}, {154, 6},  {166, 6},  {68, 6},
-    {178, 6},  {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {169, 6},
-    {70, 6},   {181, 6},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},
-    {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7},  {164, 7},  {145, 3},
-    {203, 10}, {90, 10},  {108, 10}, {69, 7},   {126, 10}, {75, 7},   {93, 7},
-    {64, 4},   {209, 12}, {158, 7},  {114, 10}, {71, 7},   {132, 10}, {77, 7},
-    {95, 7},   {65, 5},   {194, 7},  {83, 7},   {101, 7},  {67, 5},   {119, 7},
-    {73, 5},   {91, 5},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},
-    {138, 10}, {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},
-    {68, 6},   {121, 7},  {74, 6},   {92, 6},   {2, 7},    {209, 12}, {157, 6},
-    {109, 7},  {70, 6},   {127, 7},  {76, 6},   {94, 6},   {4, 7},    {193, 6},
-    {82, 6},   {100, 6},  {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {145, 3},  {206, 10}, {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},
-    {161, 4},  {64, 4},   {209, 12}, {159, 8},  {116, 10}, {72, 8},   {134, 10},
-    {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},
-    {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},
-    {148, 6},  {140, 10}, {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},
-    {63, 12},  {15, 10},  {122, 8},  {23, 10},  {39, 10},  {3, 8},    {209, 12},
-    {157, 6},  {110, 8},  {70, 6},   {128, 8},  {27, 10},  {43, 10},  {5, 8},
-    {193, 6},  {82, 6},   {51, 10},  {9, 8},    {118, 6},  {17, 8},   {33, 8},
-    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},
-    {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},
-    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},
-    {130, 8},  {29, 10},  {45, 10},  {6, 8},    {194, 7},  {83, 7},   {53, 10},
-    {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12},
-    {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},
-    {85, 7},   {57, 10},  {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},
-    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},
-    {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},
-    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},
-    {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12}, {160, 9},  {172, 9},
-    {147, 5},  {184, 9},  {150, 5},  {162, 5},  {65, 5},   {196, 9},  {153, 5},
-    {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
-    {209, 12}, {175, 9},  {148, 6},  {142, 10}, {81, 9},   {99, 9},   {66, 6},
-    {199, 9},  {87, 9},   {105, 9},  {68, 6},   {123, 9},  {74, 6},   {92, 6},
-    {64, 4},   {209, 12}, {157, 6},  {111, 9},  {70, 6},   {129, 9},  {76, 6},
-    {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},
-    {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {190, 9},  {152, 7},  {164, 7},  {145, 3},  {202, 9},  {89, 9},   {107, 9},
-    {69, 7},   {125, 9},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
-    {113, 9},  {71, 7},   {131, 9},  {30, 10},  {46, 10},  {7, 9},    {194, 7},
-    {83, 7},   {54, 10},  {11, 9},   {119, 7},  {19, 9},   {35, 9},   {1, 7},
-    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {137, 9},  {79, 7},   {97, 7},
-    {66, 6},   {197, 7},  {85, 7},   {58, 10},  {13, 9},   {121, 7},  {21, 9},
-    {37, 9},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
-    {25, 9},   {41, 9},   {4, 7},    {193, 6},  {82, 6},   {49, 9},   {8, 7},
-    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {205, 9},  {156, 8},
-    {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12},
-    {159, 8},  {115, 9},  {72, 8},   {133, 9},  {78, 8},   {96, 8},   {65, 5},
-    {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},
-    {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},  {139, 9},  {80, 8},
-    {98, 8},   {66, 6},   {198, 8},  {86, 8},   {60, 10},  {14, 9},   {122, 8},
-    {22, 9},   {38, 9},   {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},
-    {128, 8},  {26, 9},   {42, 9},   {5, 8},    {193, 6},  {82, 6},   {50, 9},
-    {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},
-    {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},
-    {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},  {28, 9},   {44, 9},
-    {6, 8},    {194, 7},  {83, 7},   {52, 9},   {10, 8},   {119, 7},  {18, 8},
-    {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},
-    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {56, 9},   {12, 8},
-    {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
-    {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},
-    {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
-    {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},
-    {64, 4},   {209, 12}, {209, 12}, {209, 12}, {147, 5},  {209, 12}, {150, 5},
-    {162, 5},  {65, 5},   {209, 12}, {153, 5},  {165, 5},  {67, 5},   {177, 5},
-    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {209, 12}, {148, 6},
-    {209, 12}, {151, 6},  {163, 6},  {66, 6},   {209, 12}, {154, 6},  {166, 6},
-    {68, 6},   {178, 6},  {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},
-    {169, 6},  {70, 6},   {181, 6},  {76, 6},   {94, 6},   {65, 5},   {193, 6},
-    {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {192, 11}, {152, 7},  {164, 7},
-    {145, 3},  {204, 11}, {155, 7},  {167, 7},  {69, 7},   {179, 7},  {75, 7},
-    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {170, 7},  {71, 7},   {182, 7},
-    {77, 7},   {95, 7},   {65, 5},   {194, 7},  {83, 7},   {101, 7},  {67, 5},
-    {119, 7},  {73, 5},   {91, 5},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
-    {148, 6},  {185, 7},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
-    {103, 7},  {68, 6},   {121, 7},  {74, 6},   {92, 6},   {2, 7},    {209, 12},
-    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {76, 6},   {94, 6},   {4, 7},
-    {193, 6},  {82, 6},   {100, 6},  {8, 7},    {118, 6},  {16, 7},   {32, 7},
-    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {145, 3},  {207, 11}, {156, 8},  {168, 8},  {146, 4},  {180, 8},
-    {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {117, 11}, {72, 8},
-    {135, 11}, {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},
-    {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12},
-    {174, 8},  {148, 6},  {141, 11}, {80, 8},   {98, 8},   {66, 6},   {198, 8},
-    {86, 8},   {104, 8},  {68, 6},   {122, 8},  {74, 6},   {92, 6},   {3, 8},
-    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {76, 6},   {94, 6},
-    {5, 8},    {193, 6},  {82, 6},   {100, 6},  {9, 8},    {118, 6},  {17, 8},
-    {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},
-    {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},
-    {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},
-    {71, 7},   {130, 8},  {77, 7},   {95, 7},   {6, 8},    {194, 7},  {83, 7},
-    {101, 7},  {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12},
-    {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},
-    {197, 7},  {85, 7},   {103, 7},  {12, 8},   {121, 7},  {20, 8},   {36, 8},
-    {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},
-    {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},
-    {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12},
-    {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12}, {160, 9},
-    {172, 9},  {147, 5},  {184, 9},  {150, 5},  {162, 5},  {65, 5},   {196, 9},
-    {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},
-    {209, 12}, {209, 12}, {175, 9},  {148, 6},  {143, 11}, {81, 9},   {99, 9},
-    {66, 6},   {199, 9},  {87, 9},   {105, 9},  {68, 6},   {123, 9},  {74, 6},
-    {92, 6},   {64, 4},   {209, 12}, {157, 6},  {111, 9},  {70, 6},   {129, 9},
-    {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},
-    {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {190, 9},  {152, 7},  {164, 7},  {145, 3},  {202, 9},  {89, 9},
-    {107, 9},  {69, 7},   {125, 9},  {75, 7},   {93, 7},   {64, 4},   {209, 12},
-    {158, 7},  {113, 9},  {71, 7},   {131, 9},  {31, 11},  {47, 11},  {7, 9},
-    {194, 7},  {83, 7},   {55, 11},  {11, 9},   {119, 7},  {19, 9},   {35, 9},
-    {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {137, 9},  {79, 7},
-    {97, 7},   {66, 6},   {197, 7},  {85, 7},   {59, 11},  {13, 9},   {121, 7},
-    {21, 9},   {37, 9},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},
-    {127, 7},  {25, 9},   {41, 9},   {4, 7},    {193, 6},  {82, 6},   {49, 9},
-    {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {205, 9},
-    {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},
-    {209, 12}, {159, 8},  {115, 9},  {72, 8},   {133, 9},  {78, 8},   {96, 8},
-    {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},
-    {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},  {139, 9},
-    {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {61, 11},  {14, 9},
-    {122, 8},  {22, 9},   {38, 9},   {3, 8},    {209, 12}, {157, 6},  {110, 8},
-    {70, 6},   {128, 8},  {26, 9},   {42, 9},   {5, 8},    {193, 6},  {82, 6},
-    {50, 9},   {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},
-    {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},
-    {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},  {28, 9},
-    {44, 9},   {6, 8},    {194, 7},  {83, 7},   {52, 9},   {10, 8},   {119, 7},
-    {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},
-    {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {56, 9},
-    {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},
-    {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},
-    {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},
-    {161, 4},  {64, 4},   {209, 12}, {209, 12}, {209, 12}, {147, 5},  {209, 12},
-    {150, 5},  {162, 5},  {65, 5},   {209, 12}, {153, 5},  {165, 5},  {67, 5},
-    {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {176, 10},
-    {148, 6},  {188, 10}, {151, 6},  {163, 6},  {66, 6},   {200, 10}, {154, 6},
-    {166, 6},  {68, 6},   {178, 6},  {74, 6},   {92, 6},   {64, 4},   {209, 12},
-    {157, 6},  {169, 6},  {70, 6},   {181, 6},  {76, 6},   {94, 6},   {65, 5},
-    {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},
-    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7},
-    {164, 7},  {145, 3},  {203, 10}, {90, 10},  {108, 10}, {69, 7},   {126, 10},
-    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {114, 10}, {71, 7},
-    {132, 10}, {77, 7},   {95, 7},   {65, 5},   {194, 7},  {83, 7},   {101, 7},
-    {67, 5},   {119, 7},  {73, 5},   {91, 5},   {1, 7},    {209, 12}, {209, 12},
-    {173, 7},  {148, 6},  {138, 10}, {79, 7},   {97, 7},   {66, 6},   {197, 7},
-    {85, 7},   {103, 7},  {68, 6},   {121, 7},  {74, 6},   {92, 6},   {2, 7},
-    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {76, 6},   {94, 6},
-    {4, 7},    {193, 6},  {82, 6},   {100, 6},  {8, 7},    {118, 6},  {16, 7},
-    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {145, 3},  {206, 10}, {156, 8},  {168, 8},  {146, 4},
-    {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {116, 10},
-    {72, 8},   {134, 10}, {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},
-    {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
-    {209, 12}, {174, 8},  {148, 6},  {140, 10}, {80, 8},   {98, 8},   {66, 6},
-    {198, 8},  {86, 8},   {62, 11},  {15, 10},  {122, 8},  {23, 10},  {39, 10},
-    {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {27, 10},
-    {43, 10},  {5, 8},    {193, 6},  {82, 6},   {51, 10},  {9, 8},    {118, 6},
-    {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
-    {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},
-    {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
-    {112, 8},  {71, 7},   {130, 8},  {29, 10},  {45, 10},  {6, 8},    {194, 7},
-    {83, 7},   {53, 10},  {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},
-    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},
-    {66, 6},   {197, 7},  {85, 7},   {57, 10},  {12, 8},   {121, 7},  {20, 8},
-    {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
-    {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},
-    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12},
-    {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12},
-    {160, 9},  {172, 9},  {147, 5},  {184, 9},  {150, 5},  {162, 5},  {65, 5},
-    {196, 9},  {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},
-    {64, 4},   {209, 12}, {209, 12}, {175, 9},  {148, 6},  {142, 10}, {81, 9},
-    {99, 9},   {66, 6},   {199, 9},  {87, 9},   {105, 9},  {68, 6},   {123, 9},
-    {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {111, 9},  {70, 6},
-    {129, 9},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},
-    {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12},
-    {209, 12}, {209, 12}, {190, 9},  {152, 7},  {164, 7},  {145, 3},  {202, 9},
-    {89, 9},   {107, 9},  {69, 7},   {125, 9},  {75, 7},   {93, 7},   {64, 4},
-    {209, 12}, {158, 7},  {113, 9},  {71, 7},   {131, 9},  {30, 10},  {46, 10},
-    {7, 9},    {194, 7},  {83, 7},   {54, 10},  {11, 9},   {119, 7},  {19, 9},
-    {35, 9},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {137, 9},
-    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {58, 10},  {13, 9},
-    {121, 7},  {21, 9},   {37, 9},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
-    {70, 6},   {127, 7},  {25, 9},   {41, 9},   {4, 7},    {193, 6},  {82, 6},
-    {49, 9},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
-    {205, 9},  {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},
-    {64, 4},   {209, 12}, {159, 8},  {115, 9},  {72, 8},   {133, 9},  {78, 8},
-    {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},
-    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},
-    {139, 9},  {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {60, 10},
-    {14, 9},   {122, 8},  {22, 9},   {38, 9},   {3, 8},    {209, 12}, {157, 6},
-    {110, 8},  {70, 6},   {128, 8},  {26, 9},   {42, 9},   {5, 8},    {193, 6},
-    {82, 6},   {50, 9},   {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},
-    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},
-    {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},
-    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},
-    {28, 9},   {44, 9},   {6, 8},    {194, 7},  {83, 7},   {52, 9},   {10, 8},
-    {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
-    {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
-    {56, 9},   {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12},
-    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},
-    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},
-    {0, 6}};
-} // namespace utf8_to_utf16
-} // namespace tables
-} // unnamed namespace
-} // namespace simdutf
+  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
+      const char16_t *buf, size_t len,
+      char *latin1_output) const noexcept final override {
+    return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output);
+  }
 
-#endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H
-/* end file src/tables/utf8_to_utf16_tables.h */
-/* begin file src/tables/utf16_to_utf8_tables.h */
-// file generated by scripts/sse_convert_utf16_to_utf8.py
-#ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H
-#define SIMDUTF_UTF16_TO_UTF8_TABLES_H
+  simdutf_warn_unused size_t
+  convert_utf16le_to_utf8(const char16_t *buf, size_t len,
+                          char *utf8_output) const noexcept final override {
+    return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output);
+  }
 
-namespace simdutf {
-namespace {
-namespace tables {
-namespace utf16_to_utf8 {
+  simdutf_warn_unused size_t
+  convert_utf16be_to_utf8(const char16_t *buf, size_t len,
+                          char *utf8_output) const noexcept final override {
+    return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output);
+  }
 
-// 1 byte for length, 16 bytes for mask
+  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+      const char16_t *buf, size_t len,
+      char *utf8_output) const noexcept final override {
+    return set_best()->convert_utf16le_to_utf8_with_errors(buf, len,
+                                                           utf8_output);
+  }
+
+  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+      const char16_t *buf, size_t len,
+      char *utf8_output) const noexcept final override {
+    return set_best()->convert_utf16be_to_utf8_with_errors(buf, len,
+                                                           utf8_output);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+      const char16_t *buf, size_t len,
+      char *utf8_output) const noexcept final override {
+    return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+      const char16_t *buf, size_t len,
+      char *utf8_output) const noexcept final override {
+    return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output);
+  }
+
+  simdutf_warn_unused size_t
+  convert_utf32_to_latin1(const char32_t *buf, size_t len,
+                          char *latin1_output) const noexcept final override {
+    return set_best()->convert_utf32_to_latin1(buf, len, latin1_output);
+  }
+
+  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
+      const char32_t *buf, size_t len,
+      char *latin1_output) const noexcept final override {
+    return set_best()->convert_utf32_to_latin1_with_errors(buf, len,
+                                                           latin1_output);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
+      const char32_t *buf, size_t len,
+      char *latin1_output) const noexcept final override {
+    return set_best()->convert_valid_utf32_to_latin1(buf, len, latin1_output);
+  }
+
+  simdutf_warn_unused size_t
+  convert_utf32_to_utf8(const char32_t *buf, size_t len,
+                        char *utf8_output) const noexcept final override {
+    return set_best()->convert_utf32_to_utf8(buf, len, utf8_output);
+  }
+
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+      const char32_t *buf, size_t len,
+      char *utf8_output) const noexcept final override {
+    return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+  }
+
+  simdutf_warn_unused size_t
+  convert_valid_utf32_to_utf8(const char32_t *buf, size_t len,
+                              char *utf8_output) const noexcept final override {
+    return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output);
+  }
+
+  simdutf_warn_unused size_t convert_utf32_to_utf16le(
+      const char32_t *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output);
+  }
+
+  simdutf_warn_unused size_t convert_utf32_to_utf16be(
+      const char32_t *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output);
+  }
+
+  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+      const char32_t *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_utf32_to_utf16le_with_errors(buf, len,
+                                                            utf16_output);
+  }
+
+  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+      const char32_t *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_utf32_to_utf16be_with_errors(buf, len,
+                                                            utf16_output);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
+      const char32_t *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
+      const char32_t *buf, size_t len,
+      char16_t *utf16_output) const noexcept final override {
+    return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output);
+  }
+
+  simdutf_warn_unused size_t convert_utf16le_to_utf32(
+      const char16_t *buf, size_t len,
+      char32_t *utf32_output) const noexcept final override {
+    return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output);
+  }
+
+  simdutf_warn_unused size_t convert_utf16be_to_utf32(
+      const char16_t *buf, size_t len,
+      char32_t *utf32_output) const noexcept final override {
+    return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output);
+  }
+
+  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+      const char16_t *buf, size_t len,
+      char32_t *utf32_output) const noexcept final override {
+    return set_best()->convert_utf16le_to_utf32_with_errors(buf, len,
+                                                            utf32_output);
+  }
+
+  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+      const char16_t *buf, size_t len,
+      char32_t *utf32_output) const noexcept final override {
+    return set_best()->convert_utf16be_to_utf32_with_errors(buf, len,
+                                                            utf32_output);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
+      const char16_t *buf, size_t len,
+      char32_t *utf32_output) const noexcept final override {
+    return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
+      const char16_t *buf, size_t len,
+      char32_t *utf32_output) const noexcept final override {
+    return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output);
+  }
+
+  void change_endianness_utf16(const char16_t *buf, size_t len,
+                               char16_t *output) const noexcept final override {
+    set_best()->change_endianness_utf16(buf, len, output);
+  }
+
+  simdutf_warn_unused size_t
+  count_utf16le(const char16_t *buf, size_t len) const noexcept final override {
+    return set_best()->count_utf16le(buf, len);
+  }
+
+  simdutf_warn_unused size_t
+  count_utf16be(const char16_t *buf, size_t len) const noexcept final override {
+    return set_best()->count_utf16be(buf, len);
+  }
+
+  simdutf_warn_unused size_t
+  count_utf8(const char *buf, size_t len) const noexcept final override {
+    return set_best()->count_utf8(buf, len);
+  }
+
+  simdutf_warn_unused size_t
+  latin1_length_from_utf8(const char *buf, size_t len) const noexcept override {
+    return set_best()->latin1_length_from_utf8(buf, len);
+  }
+
+  simdutf_warn_unused size_t
+  latin1_length_from_utf16(size_t len) const noexcept override {
+    return set_best()->latin1_length_from_utf16(len);
+  }
+
+  simdutf_warn_unused size_t
+  latin1_length_from_utf32(size_t len) const noexcept override {
+    return set_best()->latin1_length_from_utf32(len);
+  }
+
+  simdutf_warn_unused size_t
+  utf8_length_from_latin1(const char *buf, size_t len) const noexcept override {
+    return set_best()->utf8_length_from_latin1(buf, len);
+  }
+
+  simdutf_warn_unused size_t utf8_length_from_utf16le(
+      const char16_t *buf, size_t len) const noexcept override {
+    return set_best()->utf8_length_from_utf16le(buf, len);
+  }
+
+  simdutf_warn_unused size_t utf8_length_from_utf16be(
+      const char16_t *buf, size_t len) const noexcept override {
+    return set_best()->utf8_length_from_utf16be(buf, len);
+  }
+
+  simdutf_warn_unused size_t
+  utf16_length_from_latin1(size_t len) const noexcept override {
+    return set_best()->utf16_length_from_latin1(len);
+  }
+
+  simdutf_warn_unused size_t
+  utf32_length_from_latin1(size_t len) const noexcept override {
+    return set_best()->utf32_length_from_latin1(len);
+  }
+
+  simdutf_warn_unused size_t utf32_length_from_utf16le(
+      const char16_t *buf, size_t len) const noexcept override {
+    return set_best()->utf32_length_from_utf16le(buf, len);
+  }
+
+  simdutf_warn_unused size_t utf32_length_from_utf16be(
+      const char16_t *buf, size_t len) const noexcept override {
+    return set_best()->utf32_length_from_utf16be(buf, len);
+  }
+
+  simdutf_warn_unused size_t
+  utf16_length_from_utf8(const char *buf, size_t len) const noexcept override {
+    return set_best()->utf16_length_from_utf8(buf, len);
+  }
+
+  simdutf_warn_unused size_t utf8_length_from_utf32(
+      const char32_t *buf, size_t len) const noexcept override {
+    return set_best()->utf8_length_from_utf32(buf, len);
+  }
+
+  simdutf_warn_unused size_t utf16_length_from_utf32(
+      const char32_t *buf, size_t len) const noexcept override {
+    return set_best()->utf16_length_from_utf32(buf, len);
+  }
+
+  simdutf_warn_unused size_t
+  utf32_length_from_utf8(const char *buf, size_t len) const noexcept override {
+    return set_best()->utf32_length_from_utf8(buf, len);
+  }
+
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(
+      const char *input, size_t length) const noexcept override {
+    return set_best()->maximal_binary_length_from_base64(input, length);
+  }
+
+  simdutf_warn_unused result base64_to_binary(
+      const char *input, size_t length, char *output, base64_options options,
+      last_chunk_handling_options last_chunk_handling_options =
+          last_chunk_handling_options::loose) const noexcept override {
+    return set_best()->base64_to_binary(input, length, output, options,
+                                        last_chunk_handling_options);
+  }
+
+  simdutf_warn_unused full_result base64_to_binary_details(
+      const char *input, size_t length, char *output, base64_options options,
+      last_chunk_handling_options last_chunk_handling_options =
+          last_chunk_handling_options::loose) const noexcept override {
+    return set_best()->base64_to_binary_details(input, length, output, options,
+                                                last_chunk_handling_options);
+  }
+
+  simdutf_warn_unused size_t maximal_binary_length_from_base64(
+      const char16_t *input, size_t length) const noexcept override {
+    return set_best()->maximal_binary_length_from_base64(input, length);
+  }
+
+  simdutf_warn_unused result base64_to_binary(
+      const char16_t *input, size_t length, char *output,
+      base64_options options,
+      last_chunk_handling_options last_chunk_handling_options =
+          last_chunk_handling_options::loose) const noexcept override {
+    return set_best()->base64_to_binary(input, length, output, options,
+                                        last_chunk_handling_options);
+  }
+
+  simdutf_warn_unused full_result base64_to_binary_details(
+      const char16_t *input, size_t length, char *output,
+      base64_options options,
+      last_chunk_handling_options last_chunk_handling_options =
+          last_chunk_handling_options::loose) const noexcept override {
+    return set_best()->base64_to_binary_details(input, length, output, options,
+                                                last_chunk_handling_options);
+  }
+
+  simdutf_warn_unused size_t base64_length_from_binary(
+      size_t length, base64_options options) const noexcept override {
+    return set_best()->base64_length_from_binary(length, options);
+  }
+
+  size_t binary_to_base64(const char *input, size_t length, char *output,
+                          base64_options options) const noexcept override {
+    return set_best()->binary_to_base64(input, length, output, options);
+  }
+
+  simdutf_really_inline
+  detect_best_supported_implementation_on_first_use() noexcept
+      : implementation("best_supported_detector",
+                       "Detects the best supported implementation and sets it",
+                       0) {}
+
+private:
+  const implementation *set_best() const noexcept;
+};
+
+static_assert(std::is_trivially_destructible<
+                  detect_best_supported_implementation_on_first_use>::value,
+              "detect_best_supported_implementation_on_first_use should be "
+              "trivially destructible");
+
+static const std::initializer_list<const implementation *> &
+get_available_implementation_pointers() {
+  static const std::initializer_list<const implementation *>
+      available_implementation_pointers{
+#if SIMDUTF_IMPLEMENTATION_ICELAKE
+          get_icelake_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_HASWELL
+          get_haswell_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_WESTMERE
+          get_westmere_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_ARM64
+          get_arm64_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_PPC64
+          get_ppc64_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_RVV
+          get_rvv_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_LSX
+          get_lsx_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_LASX
+          get_lasx_singleton(),
+#endif
+#if SIMDUTF_IMPLEMENTATION_FALLBACK
+          get_fallback_singleton(),
+#endif
+      }; // available_implementation_pointers
+  return available_implementation_pointers;
+}
+
+// Stand-in implementation for when there is no support: the overrides below
+// do no work and return false, 0, unspecified, or error_code::OTHER.
+class unsupported_implementation final : public implementation {
+public:
+  simdutf_warn_unused int detect_encodings(const char *,
+                                           size_t) const noexcept override {
+    return encoding_type::unspecified;
+  }
+
+  simdutf_warn_unused bool validate_utf8(const char *,
+                                         size_t) const noexcept final override {
+    return false; // Refuse to validate: every input is reported as invalid.
+    // Rationale: a fallback implementation is provided, so it seems unlikely
+    // that unsupported_implementation will ever be used. If it is used, it
+    // flags all strings as invalid. The alternative would be to return an
+    // error_code from which the user has to work out whether the string is
+    // valid UTF-8, which is a lot of work just to handle the very unlikely
+    // case of an unsupported implementation. And when that does happen, what
+    // are the chances that the programmer has a fallback of their own? Since
+    // *we* already provide the fallback, the programmer would need a fallback
+    // for our fallback.
+    // (validate_utf8_with_errors below reports error_code::OTHER instead.)
+  }
+
+  simdutf_warn_unused result validate_utf8_with_errors(
+      const char *, size_t) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused bool
+  validate_ascii(const char *, size_t) const noexcept final override {
+    return false;
+  }
+
+  simdutf_warn_unused result validate_ascii_with_errors(
+      const char *, size_t) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused bool
+  validate_utf16le(const char16_t *, size_t) const noexcept final override {
+    return false;
+  }
+
+  simdutf_warn_unused bool
+  validate_utf16be(const char16_t *, size_t) const noexcept final override {
+    return false;
+  }
+
+  simdutf_warn_unused result validate_utf16le_with_errors(
+      const char16_t *, size_t) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused result validate_utf16be_with_errors(
+      const char16_t *, size_t) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused bool
+  validate_utf32(const char32_t *, size_t) const noexcept final override {
+    return false;
+  }
+
+  simdutf_warn_unused result validate_utf32_with_errors(
+      const char32_t *, size_t) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_latin1_to_utf8(
+      const char *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_latin1_to_utf16le(
+      const char *, size_t, char16_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_latin1_to_utf16be(
+      const char *, size_t, char16_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_latin1_to_utf32(
+      const char *, size_t, char32_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf8_to_latin1(
+      const char *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused result convert_utf8_to_latin1_with_errors(
+      const char *, size_t, char *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf8_to_latin1(
+      const char *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf8_to_utf16le(
+      const char *, size_t, char16_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf8_to_utf16be(
+      const char *, size_t, char16_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused result convert_utf8_to_utf16le_with_errors(
+      const char *, size_t, char16_t *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused result convert_utf8_to_utf16be_with_errors(
+      const char *, size_t, char16_t *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16le(
+      const char *, size_t, char16_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf16be(
+      const char *, size_t, char16_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf8_to_utf32(
+      const char *, size_t, char32_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused result convert_utf8_to_utf32_with_errors(
+      const char *, size_t, char32_t *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf8_to_utf32(
+      const char *, size_t, char32_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf16le_to_latin1(
+      const char16_t *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf16be_to_latin1(
+      const char16_t *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused result convert_utf16le_to_latin1_with_errors(
+      const char16_t *, size_t, char *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused result convert_utf16be_to_latin1_with_errors(
+      const char16_t *, size_t, char *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(
+      const char16_t *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(
+      const char16_t *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf16le_to_utf8(
+      const char16_t *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf16be_to_utf8(
+      const char16_t *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused result convert_utf16le_to_utf8_with_errors(
+      const char16_t *, size_t, char *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused result convert_utf16be_to_utf8_with_errors(
+      const char16_t *, size_t, char *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf8(
+      const char16_t *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf8(
+      const char16_t *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf32_to_latin1(
+      const char32_t *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused result convert_utf32_to_latin1_with_errors(
+      const char32_t *, size_t, char *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf32_to_latin1(
+      const char32_t *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf32_to_utf8(
+      const char32_t *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused result convert_utf32_to_utf8_with_errors(
+      const char32_t *, size_t, char *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf8(
+      const char32_t *, size_t, char *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf32_to_utf16le(
+      const char32_t *, size_t, char16_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf32_to_utf16be(
+      const char32_t *, size_t, char16_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused result convert_utf32_to_utf16le_with_errors(
+      const char32_t *, size_t, char16_t *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused result convert_utf32_to_utf16be_with_errors(
+      const char32_t *, size_t, char16_t *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16le(
+      const char32_t *, size_t, char16_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf32_to_utf16be(
+      const char32_t *, size_t, char16_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf16le_to_utf32(
+      const char16_t *, size_t, char32_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_utf16be_to_utf32(
+      const char16_t *, size_t, char32_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused result convert_utf16le_to_utf32_with_errors(
+      const char16_t *, size_t, char32_t *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused result convert_utf16be_to_utf32_with_errors(
+      const char16_t *, size_t, char32_t *) const noexcept final override {
+    return result(error_code::OTHER, 0);
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf16le_to_utf32(
+      const char16_t *, size_t, char32_t *) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(
+      const char16_t *, size_t, char32_t *) const noexcept final override {
+    return 0;
+  }
+
+  void change_endianness_utf16(const char16_t *, size_t,
+                               char16_t *) const noexcept final override {}
+
+  simdutf_warn_unused size_t
+  count_utf16le(const char16_t *, size_t) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t
+  count_utf16be(const char16_t *, size_t) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t count_utf8(const char *,
+                                        size_t) const noexcept final override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t
+  latin1_length_from_utf8(const char *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t
+  latin1_length_from_utf16(size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t
+  latin1_length_from_utf32(size_t) const noexcept override {
+    return 0;
+  }
+  simdutf_warn_unused size_t
+  utf8_length_from_latin1(const char *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t
+  utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t
+  utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t
+  utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t
+  utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override {
+    return 0;
+  }
+
+  simdutf_warn_unused size_t // Unsupported-CPU stub; the length argument is ignored.
+  utf32_length_from_latin1(size_t) const noexcept override {
+    return 0; // unconditionally 0
+  }
+
+  simdutf_warn_unused size_t // Unsupported-CPU stub; arguments are unnamed and ignored.
+  utf16_length_from_utf8(const char *, size_t) const noexcept override {
+    return 0; // unconditionally 0
+  }
+  simdutf_warn_unused size_t // Unsupported-CPU stub; the length argument is ignored.
+  utf16_length_from_latin1(size_t) const noexcept override {
+    return 0; // unconditionally 0
+  }
+  simdutf_warn_unused size_t // Unsupported-CPU stub; arguments are unnamed and ignored.
+  utf8_length_from_utf32(const char32_t *, size_t) const noexcept override {
+    return 0; // unconditionally 0
+  }
+
+  simdutf_warn_unused size_t // Unsupported-CPU stub; arguments are unnamed and ignored.
+  utf16_length_from_utf32(const char32_t *, size_t) const noexcept override {
+    return 0; // unconditionally 0
+  }
+
+  simdutf_warn_unused size_t // Unsupported-CPU stub; arguments are unnamed and ignored.
+  utf32_length_from_utf8(const char *, size_t) const noexcept override {
+    return 0; // unconditionally 0
+  }
+
+  simdutf_warn_unused size_t maximal_binary_length_from_base64( // Unsupported-CPU stub (char input overload); arguments are ignored.
+      const char *, size_t) const noexcept override {
+    return 0; // unconditionally 0
+  }
+
+  simdutf_warn_unused result // Unsupported-CPU stub (char input overload); arguments are ignored.
+  base64_to_binary(const char *, size_t, char *, base64_options,
+                   last_chunk_handling_options) const noexcept override {
+    return result(error_code::OTHER, 0); // always reports error_code::OTHER; nothing is decoded
+  }
+
+  simdutf_warn_unused full_result base64_to_binary_details( // Unsupported-CPU stub (char input overload); arguments are ignored.
+      const char *, size_t, char *, base64_options,
+      last_chunk_handling_options) const noexcept override {
+    return full_result(error_code::OTHER, 0, 0); // always error_code::OTHER with both trailing fields 0
+  }
+
+  simdutf_warn_unused size_t maximal_binary_length_from_base64( // Unsupported-CPU stub (char16_t input overload); arguments are ignored.
+      const char16_t *, size_t) const noexcept override {
+    return 0; // unconditionally 0
+  }
+
+  simdutf_warn_unused result // Unsupported-CPU stub (char16_t input overload); arguments are ignored.
+  base64_to_binary(const char16_t *, size_t, char *, base64_options,
+                   last_chunk_handling_options) const noexcept override {
+    return result(error_code::OTHER, 0); // always reports error_code::OTHER; nothing is decoded
+  }
+
+  simdutf_warn_unused full_result base64_to_binary_details( // Unsupported-CPU stub (char16_t input overload); arguments are ignored.
+      const char16_t *, size_t, char *, base64_options,
+      last_chunk_handling_options) const noexcept override {
+    return full_result(error_code::OTHER, 0, 0); // always error_code::OTHER with both trailing fields 0
+  }
+
+  simdutf_warn_unused size_t // Unsupported-CPU stub; arguments are unnamed and ignored.
+  base64_length_from_binary(size_t, base64_options) const noexcept override {
+    return 0; // unconditionally 0
+  }
+
+  size_t binary_to_base64(const char *, size_t, char *, // Unsupported-CPU stub; arguments are unnamed and ignored.
+                          base64_options) const noexcept override {
+    return 0; // unconditionally 0; the output pointer is never written
+  }
+
+  unsupported_implementation() // Default constructor: passes a fixed name, description and 0 to the implementation base.
+      : implementation("unsupported",
+                       "Unsupported CPU (no detected SIMD instructions)", 0) {} // NOTE(review): the 0 is presumably the required-instruction-set mask; confirm against implementation's constructor
+};
+
+const unsupported_implementation *get_unsupported_singleton() { // Returns the address of one shared unsupported_implementation instance.
+  static const unsupported_implementation unsupported_singleton{}; // function-local static: constructed on first call
+  return &unsupported_singleton;
+}
+static_assert(std::is_trivially_destructible<unsupported_implementation>::value, // compile-time guard on the singleton's type
+              "unsupported_singleton should be trivially destructible");
+
+size_t available_implementation_list::size() const noexcept { // Number of entries in internal::get_available_implementation_pointers().
+  return internal::get_available_implementation_pointers().size();
+}
+const implementation *const * // Begin iterator of internal::get_available_implementation_pointers().
+available_implementation_list::begin() const noexcept {
+  return internal::get_available_implementation_pointers().begin();
+}
+const implementation *const * // End iterator of internal::get_available_implementation_pointers().
+available_implementation_list::end() const noexcept {
+  return internal::get_available_implementation_pointers().end();
+}
+const implementation * // Returns the first listed implementation whose required bits are all in the detected mask.
+available_implementation_list::detect_best_supported() const noexcept {
+  // They are prelisted in priority order, so we just go down the list
+  uint32_t supported_instruction_sets =
+      internal::detect_supported_architectures(); // queried once, before the loop
+  for (const implementation *impl :
+       internal::get_available_implementation_pointers()) {
+    uint32_t required_instruction_sets = impl->required_instruction_sets();
+    if ((supported_instruction_sets & required_instruction_sets) ==
+        required_instruction_sets) { // true only when no required bit is missing from the supported mask
+      return impl;
+    }
+  }
+  return get_unsupported_singleton(); // this should never happen?
+}
+
+const implementation * // Chooses an implementation, stores it through get_active_implementation(), and returns it.
+detect_best_supported_implementation_on_first_use::set_best() const noexcept {
+  SIMDUTF_PUSH_DISABLE_WARNINGS
+  SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC:
+                                     // manually verified this is safe
+      char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION"); // null when the variable is not set
+  SIMDUTF_POP_DISABLE_WARNINGS
+
+  if (force_implementation_name) { // an override was requested through the environment
+    auto force_implementation =
+        get_available_implementations()[force_implementation_name]; // look the requested name up in the compiled-in list
+    if (force_implementation) {
+      return get_active_implementation() = force_implementation;
+    } else {
+      // Note: abort() and stderr usage within the library is forbidden.
+      return get_active_implementation() = get_unsupported_singleton(); // lookup failed: install the stub rather than auto-detecting
+    }
+  }
+  return get_active_implementation() = // no override: use the result of detect_best_supported()
+             get_available_implementations().detect_best_supported();
+}
+
+} // namespace internal
+
+/**
+ * The list of available implementations compiled into simdutf.
+ */
+SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list & // Returns a reference to one shared, const list object.
+get_available_implementations() {
+  static const internal::available_implementation_list
+      available_implementations{}; // function-local static: constructed on first call
+  return available_implementations;
+}
+
+/**
+ * The active implementation.
+ */
+SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr<const implementation> & // Returns a mutable reference to the function-local static pointer to the active implementation.
+get_active_implementation() {
+#if SIMDUTF_SINGLE_IMPLEMENTATION
+  // skip runtime detection
+  static internal::atomic_ptr<const implementation> active_implementation{
+      internal::get_single_implementation()}; // initialized straight from the single compiled-in implementation
+  return active_implementation;
+#else
+  static const internal::detect_best_supported_implementation_on_first_use
+      detect_best_supported_implementation_on_first_use_singleton; // initial target; its set_best() assigns a new value to the active pointer
+  static internal::atomic_ptr<const implementation> active_implementation{
+      &detect_best_supported_implementation_on_first_use_singleton};
+  return active_implementation;
+#endif
+}
+
+#if SIMDUTF_SINGLE_IMPLEMENTATION // single-implementation build: hand back the raw pointer
+const implementation *get_default_implementation() {
+  return internal::get_single_implementation();
+}
+#else // otherwise: same object as get_active_implementation(); note the different return type
+internal::atomic_ptr<const implementation> &get_default_implementation() {
+  return get_active_implementation();
+}
+#endif
+#define SIMDUTF_GET_CURRENT_IMPLEMENTION // NOTE(review): spelled IMPLEMENTION; do not rename without checking every use of the macro
+
+simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept { // Delegates to the implementation returned by get_default_implementation().
+  return get_default_implementation()->validate_utf8(buf, len);
+}
+simdutf_warn_unused result validate_utf8_with_errors(const char *buf, // Delegates to the implementation returned by get_default_implementation().
+                                                     size_t len) noexcept {
+  return get_default_implementation()->validate_utf8_with_errors(buf, len);
+}
+simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept { // Delegates to the implementation returned by get_default_implementation().
+  return get_default_implementation()->validate_ascii(buf, len);
+}
+simdutf_warn_unused result validate_ascii_with_errors(const char *buf, // Delegates to the implementation returned by get_default_implementation().
+                                                      size_t len) noexcept {
+  return get_default_implementation()->validate_ascii_with_errors(buf, len);
+}
+simdutf_warn_unused size_t convert_utf8_to_utf16( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char *input, size_t length, char16_t *utf16_output) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf8_to_utf16be(input, length, utf16_output);
+#else
+  return convert_utf8_to_utf16le(input, length, utf16_output);
+#endif
+}
+simdutf_warn_unused size_t convert_latin1_to_utf8(const char *buf, size_t len, // Delegates to the implementation returned by get_default_implementation().
+                                                  char *utf8_output) noexcept {
+  return get_default_implementation()->convert_latin1_to_utf8(buf, len,
+                                                              utf8_output);
+}
+simdutf_warn_unused size_t convert_latin1_to_utf16le( // Delegates to the implementation returned by get_default_implementation().
+    const char *buf, size_t len, char16_t *utf16_output) noexcept {
+  return get_default_implementation()->convert_latin1_to_utf16le(buf, len,
+                                                                 utf16_output);
+}
+simdutf_warn_unused size_t convert_latin1_to_utf16be( // Delegates to the implementation returned by get_default_implementation().
+    const char *buf, size_t len, char16_t *utf16_output) noexcept {
+  return get_default_implementation()->convert_latin1_to_utf16be(buf, len,
+                                                                 utf16_output);
+}
+simdutf_warn_unused size_t convert_latin1_to_utf32( // Delegates to the implementation returned by get_default_implementation().
+    const char *buf, size_t len, char32_t *utf32_output) noexcept { // utf32_output: the char32_t destination, named like convert_utf8_to_utf32's
+  return get_default_implementation()->convert_latin1_to_utf32(buf, len,
+                                                               utf32_output);
+}
+simdutf_warn_unused size_t convert_utf8_to_latin1( // Delegates to the implementation returned by get_default_implementation().
+    const char *buf, size_t len, char *latin1_output) noexcept {
+  return get_default_implementation()->convert_utf8_to_latin1(buf, len,
+                                                              latin1_output);
+}
+simdutf_warn_unused result convert_utf8_to_latin1_with_errors( // Delegates to the implementation returned by get_default_implementation().
+    const char *buf, size_t len, char *latin1_output) noexcept {
+  return get_default_implementation()->convert_utf8_to_latin1_with_errors(
+      buf, len, latin1_output);
+}
+simdutf_warn_unused size_t convert_valid_utf8_to_latin1( // Delegates to the implementation returned by get_default_implementation().
+    const char *buf, size_t len, char *latin1_output) noexcept {
+  return get_default_implementation()->convert_valid_utf8_to_latin1(
+      buf, len, latin1_output);
+}
+simdutf_warn_unused size_t convert_utf8_to_utf16le( // Delegates to the implementation returned by get_default_implementation().
+    const char *input, size_t length, char16_t *utf16_output) noexcept {
+  return get_default_implementation()->convert_utf8_to_utf16le(input, length,
+                                                               utf16_output);
+}
+simdutf_warn_unused size_t convert_utf8_to_utf16be( // Delegates to the implementation returned by get_default_implementation().
+    const char *input, size_t length, char16_t *utf16_output) noexcept {
+  return get_default_implementation()->convert_utf8_to_utf16be(input, length,
+                                                               utf16_output);
+}
+simdutf_warn_unused result convert_utf8_to_utf16_with_errors( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char *input, size_t length, char16_t *utf16_output) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf8_to_utf16be_with_errors(input, length, utf16_output);
+#else
+  return convert_utf8_to_utf16le_with_errors(input, length, utf16_output);
+#endif
+}
+simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( // Delegates to the implementation returned by get_default_implementation().
+    const char *input, size_t length, char16_t *utf16_output) noexcept {
+  return get_default_implementation()->convert_utf8_to_utf16le_with_errors(
+      input, length, utf16_output);
+}
+simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( // Delegates to the implementation returned by get_default_implementation().
+    const char *input, size_t length, char16_t *utf16_output) noexcept {
+  return get_default_implementation()->convert_utf8_to_utf16be_with_errors(
+      input, length, utf16_output);
+}
+simdutf_warn_unused size_t convert_utf8_to_utf32( // Delegates to the implementation returned by get_default_implementation().
+    const char *input, size_t length, char32_t *utf32_output) noexcept {
+  return get_default_implementation()->convert_utf8_to_utf32(input, length,
+                                                             utf32_output);
+}
+simdutf_warn_unused result convert_utf8_to_utf32_with_errors( // Delegates to the implementation returned by get_default_implementation().
+    const char *input, size_t length, char32_t *utf32_output) noexcept {
+  return get_default_implementation()->convert_utf8_to_utf32_with_errors(
+      input, length, utf32_output);
+}
+simdutf_warn_unused bool validate_utf16(const char16_t *buf, // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+                                        size_t len) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return validate_utf16be(buf, len);
+#else
+  return validate_utf16le(buf, len);
+#endif
+}
+simdutf_warn_unused bool validate_utf16le(const char16_t *buf, // Delegates to the implementation returned by get_default_implementation().
+                                          size_t len) noexcept {
+  return get_default_implementation()->validate_utf16le(buf, len);
+}
+simdutf_warn_unused bool validate_utf16be(const char16_t *buf, // Delegates to the implementation returned by get_default_implementation().
+                                          size_t len) noexcept {
+  return get_default_implementation()->validate_utf16be(buf, len);
+}
+simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+                                                      size_t len) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return validate_utf16be_with_errors(buf, len);
+#else
+  return validate_utf16le_with_errors(buf, len);
+#endif
+}
+simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, // Delegates to the implementation returned by get_default_implementation().
+                                                        size_t len) noexcept {
+  return get_default_implementation()->validate_utf16le_with_errors(buf, len);
+}
+simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, // Delegates to the implementation returned by get_default_implementation().
+                                                        size_t len) noexcept {
+  return get_default_implementation()->validate_utf16be_with_errors(buf, len);
+}
+simdutf_warn_unused bool validate_utf32(const char32_t *buf, // Delegates to the implementation returned by get_default_implementation().
+                                        size_t len) noexcept {
+  return get_default_implementation()->validate_utf32(buf, len);
+}
+simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, // Delegates to the implementation returned by get_default_implementation().
+                                                      size_t len) noexcept {
+  return get_default_implementation()->validate_utf32_with_errors(buf, len);
+}
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char *input, size_t length, char16_t *utf16_buffer) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_valid_utf8_to_utf16be(input, length, utf16_buffer);
+#else
+  return convert_valid_utf8_to_utf16le(input, length, utf16_buffer);
+#endif
+}
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( // Delegates to the implementation returned by get_default_implementation().
+    const char *input, size_t length, char16_t *utf16_buffer) noexcept {
+  return get_default_implementation()->convert_valid_utf8_to_utf16le(
+      input, length, utf16_buffer);
+}
+simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( // Delegates to the implementation returned by get_default_implementation().
+    const char *input, size_t length, char16_t *utf16_buffer) noexcept {
+  return get_default_implementation()->convert_valid_utf8_to_utf16be(
+      input, length, utf16_buffer);
+}
+simdutf_warn_unused size_t convert_valid_utf8_to_utf32( // Delegates to the implementation returned by get_default_implementation().
+    const char *input, size_t length, char32_t *utf32_buffer) noexcept {
+  return get_default_implementation()->convert_valid_utf8_to_utf32(
+      input, length, utf32_buffer);
+}
+simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *buf, // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+                                                 size_t len,
+                                                 char *utf8_buffer) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf16be_to_utf8(buf, len, utf8_buffer);
+#else
+  return convert_utf16le_to_utf8(buf, len, utf8_buffer);
+#endif
+}
+simdutf_warn_unused size_t convert_utf16_to_latin1( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf16be_to_latin1(buf, len, latin1_buffer);
+#else
+  return convert_utf16le_to_latin1(buf, len, latin1_buffer);
+#endif
+}
+simdutf_warn_unused size_t convert_latin1_to_utf16( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char *buf, size_t len, char16_t *utf16_output) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_latin1_to_utf16be(buf, len, utf16_output);
+#else
+  return convert_latin1_to_utf16le(buf, len, utf16_output);
+#endif
+}
+simdutf_warn_unused size_t convert_utf16be_to_latin1( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+  return get_default_implementation()->convert_utf16be_to_latin1(buf, len,
+                                                                 latin1_buffer);
+}
+simdutf_warn_unused size_t convert_utf16le_to_latin1( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+  return get_default_implementation()->convert_utf16le_to_latin1(buf, len,
+                                                                 latin1_buffer);
+}
+simdutf_warn_unused size_t convert_valid_utf16be_to_latin1( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+  return get_default_implementation()->convert_valid_utf16be_to_latin1(
+      buf, len, latin1_buffer);
+}
+simdutf_warn_unused size_t convert_valid_utf16le_to_latin1( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+  return get_default_implementation()->convert_valid_utf16le_to_latin1(
+      buf, len, latin1_buffer);
+}
+simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+  return get_default_implementation()->convert_utf16le_to_latin1_with_errors(
+      buf, len, latin1_buffer);
+}
+simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+  return get_default_implementation()->convert_utf16be_to_latin1_with_errors(
+      buf, len, latin1_buffer);
+}
+simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *buf, // Delegates to the implementation returned by get_default_implementation().
+                                                   size_t len,
+                                                   char *utf8_buffer) noexcept {
+  return get_default_implementation()->convert_utf16le_to_utf8(buf, len,
+                                                               utf8_buffer);
+}
+simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *buf, // Delegates to the implementation returned by get_default_implementation().
+                                                   size_t len,
+                                                   char *utf8_buffer) noexcept {
+  return get_default_implementation()->convert_utf16be_to_utf8(buf, len,
+                                                               utf8_buffer);
+}
+simdutf_warn_unused result convert_utf16_to_utf8_with_errors( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer);
+#else
+  return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer);
+#endif
+}
+simdutf_warn_unused result convert_utf16_to_latin1_with_errors( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer);
+#else
+  return convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer);
+#endif
+}
+simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
+  return get_default_implementation()->convert_utf16le_to_utf8_with_errors(
+      buf, len, utf8_buffer);
+}
+simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
+  return get_default_implementation()->convert_utf16be_to_utf8_with_errors(
+      buf, len, utf8_buffer);
+}
+simdutf_warn_unused size_t convert_valid_utf16_to_utf8( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer);
+#else
+  return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer);
+#endif
+}
+simdutf_warn_unused size_t convert_valid_utf16_to_latin1( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char16_t *buf, size_t len, char *latin1_buffer) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_valid_utf16be_to_latin1(buf, len, latin1_buffer);
+#else
+  return convert_valid_utf16le_to_latin1(buf, len, latin1_buffer);
+#endif
+}
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
+  return get_default_implementation()->convert_valid_utf16le_to_utf8(
+      buf, len, utf8_buffer);
+}
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char *utf8_buffer) noexcept {
+  return get_default_implementation()->convert_valid_utf16be_to_utf8(
+      buf, len, utf8_buffer);
+}
+simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *buf, // Delegates to the implementation returned by get_default_implementation().
+                                                 size_t len,
+                                                 char *utf8_buffer) noexcept {
+  return get_default_implementation()->convert_utf32_to_utf8(buf, len,
+                                                             utf8_buffer);
+}
+simdutf_warn_unused result convert_utf32_to_utf8_with_errors( // Delegates to the implementation returned by get_default_implementation().
+    const char32_t *buf, size_t len, char *utf8_buffer) noexcept {
+  return get_default_implementation()->convert_utf32_to_utf8_with_errors(
+      buf, len, utf8_buffer);
+}
+simdutf_warn_unused size_t convert_valid_utf32_to_utf8( // Delegates to the implementation returned by get_default_implementation().
+    const char32_t *buf, size_t len, char *utf8_buffer) noexcept {
+  return get_default_implementation()->convert_valid_utf32_to_utf8(buf, len,
+                                                                   utf8_buffer);
+}
+simdutf_warn_unused size_t convert_utf32_to_utf16( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf32_to_utf16be(buf, len, utf16_buffer);
+#else
+  return convert_utf32_to_utf16le(buf, len, utf16_buffer);
+#endif
+}
+simdutf_warn_unused size_t convert_utf32_to_latin1( // Delegates to the implementation returned by get_default_implementation().
+    const char32_t *input, size_t length, char *latin1_output) noexcept {
+  return get_default_implementation()->convert_utf32_to_latin1(input, length,
+                                                               latin1_output);
+}
+simdutf_warn_unused size_t convert_utf32_to_utf16le( // Delegates to the implementation returned by get_default_implementation().
+    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+  return get_default_implementation()->convert_utf32_to_utf16le(buf, len,
+                                                                utf16_buffer);
+}
+simdutf_warn_unused size_t convert_utf32_to_utf16be( // Delegates to the implementation returned by get_default_implementation().
+    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+  return get_default_implementation()->convert_utf32_to_utf16be(buf, len,
+                                                                utf16_buffer);
+}
+simdutf_warn_unused result convert_utf32_to_utf16_with_errors( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer);
+#else
+  return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer);
+#endif
+}
+simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( // Delegates to the implementation returned by get_default_implementation().
+    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+  return get_default_implementation()->convert_utf32_to_utf16le_with_errors(
+      buf, len, utf16_buffer);
+}
+simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( // Delegates to the implementation returned by get_default_implementation().
+    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+  return get_default_implementation()->convert_utf32_to_utf16be_with_errors(
+      buf, len, utf16_buffer);
+}
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer);
+#else
+  return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer);
+#endif
+}
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16le( // Delegates to the implementation returned by get_default_implementation().
+    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+  return get_default_implementation()->convert_valid_utf32_to_utf16le(
+      buf, len, utf16_buffer);
+}
+simdutf_warn_unused size_t convert_valid_utf32_to_utf16be( // Delegates to the implementation returned by get_default_implementation().
+    const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept {
+  return get_default_implementation()->convert_valid_utf32_to_utf16be(
+      buf, len, utf16_buffer);
+}
+simdutf_warn_unused size_t convert_utf16_to_utf32( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf16be_to_utf32(buf, len, utf32_buffer);
+#else
+  return convert_utf16le_to_utf32(buf, len, utf32_buffer);
+#endif
+}
+simdutf_warn_unused size_t convert_utf16le_to_utf32( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+  return get_default_implementation()->convert_utf16le_to_utf32(buf, len,
+                                                                utf32_buffer);
+}
+simdutf_warn_unused size_t convert_utf16be_to_utf32( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+  return get_default_implementation()->convert_utf16be_to_utf32(buf, len,
+                                                                utf32_buffer);
+}
+simdutf_warn_unused result convert_utf16_to_utf32_with_errors( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer);
+#else
+  return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer);
+#endif
+}
+simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+  return get_default_implementation()->convert_utf16le_to_utf32_with_errors(
+      buf, len, utf32_buffer);
+}
+simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+  return get_default_implementation()->convert_utf16be_to_utf32_with_errors(
+      buf, len, utf32_buffer);
+}
+simdutf_warn_unused size_t convert_valid_utf16_to_utf32( // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer);
+#else
+  return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer);
+#endif
+}
+simdutf_warn_unused size_t convert_valid_utf16le_to_utf32( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+  return get_default_implementation()->convert_valid_utf16le_to_utf32(
+      buf, len, utf32_buffer);
+}
+simdutf_warn_unused size_t convert_valid_utf16be_to_utf32( // Delegates to the implementation returned by get_default_implementation().
+    const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept {
+  return get_default_implementation()->convert_valid_utf16be_to_utf32(
+      buf, len, utf32_buffer);
+}
+void change_endianness_utf16(const char16_t *input, size_t length, // Delegates to the implementation returned by get_default_implementation().
+                             char16_t *output) noexcept {
+  get_default_implementation()->change_endianness_utf16(input, length, output);
+}
+simdutf_warn_unused size_t count_utf16(const char16_t *input, // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+                                       size_t length) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return count_utf16be(input, length);
+#else
+  return count_utf16le(input, length);
+#endif
+}
+simdutf_warn_unused size_t count_utf16le(const char16_t *input, // Delegates to the implementation returned by get_default_implementation().
+                                         size_t length) noexcept {
+  return get_default_implementation()->count_utf16le(input, length);
+}
+simdutf_warn_unused size_t count_utf16be(const char16_t *input, // Delegates to the implementation returned by get_default_implementation().
+                                         size_t length) noexcept {
+  return get_default_implementation()->count_utf16be(input, length);
+}
+simdutf_warn_unused size_t count_utf8(const char *input, // Delegates to the implementation returned by get_default_implementation().
+                                      size_t length) noexcept {
+  return get_default_implementation()->count_utf8(input, length);
+}
+simdutf_warn_unused size_t latin1_length_from_utf8(const char *buf, // Delegates to the implementation returned by get_default_implementation().
+                                                   size_t len) noexcept {
+  return get_default_implementation()->latin1_length_from_utf8(buf, len);
+}
+simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) noexcept { // Delegates to the implementation returned by get_default_implementation().
+  return get_default_implementation()->latin1_length_from_utf16(len);
+}
+simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) noexcept { // Delegates to the implementation returned by get_default_implementation().
+  return get_default_implementation()->latin1_length_from_utf32(len);
+}
+simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf, // Delegates to the implementation returned by get_default_implementation().
+                                                   size_t len) noexcept {
+  return get_default_implementation()->utf8_length_from_latin1(buf, len);
+}
+simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input, // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+                                                  size_t length) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return utf8_length_from_utf16be(input, length);
+#else
+  return utf8_length_from_utf16le(input, length);
+#endif
+}
+simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input, // Delegates to the implementation returned by get_default_implementation().
+                                                    size_t length) noexcept {
+  return get_default_implementation()->utf8_length_from_utf16le(input, length);
+}
+simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input, // Delegates to the implementation returned by get_default_implementation().
+                                                    size_t length) noexcept {
+  return get_default_implementation()->utf8_length_from_utf16be(input, length);
+}
+simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input, // Compile-time dispatch: BE variant if SIMDUTF_IS_BIG_ENDIAN is nonzero, else LE variant.
+                                                   size_t length) noexcept {
+#if SIMDUTF_IS_BIG_ENDIAN
+  return utf32_length_from_utf16be(input, length);
+#else
+  return utf32_length_from_utf16le(input, length);
+#endif
+}
+simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input, // Delegates to the implementation returned by get_default_implementation().
+                                                     size_t length) noexcept {
+  return get_default_implementation()->utf32_length_from_utf16le(input, length);
+}
+simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input, // Delegates to the implementation returned by get_default_implementation().
+                                                     size_t length) noexcept {
+  return get_default_implementation()->utf32_length_from_utf16be(input, length);
+}
+simdutf_warn_unused size_t utf16_length_from_utf8(const char *input, // Delegates to the implementation returned by get_default_implementation().
+                                                  size_t length) noexcept {
+  return get_default_implementation()->utf16_length_from_utf8(input, length);
+}
+simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept { // Delegates to the implementation returned by get_default_implementation().
+  return get_default_implementation()->utf16_length_from_latin1(length);
+}
+simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input, // Delegates to the implementation returned by get_default_implementation().
+                                                  size_t length) noexcept {
+  return get_default_implementation()->utf8_length_from_utf32(input, length);
+}
+simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input, // Delegates to the implementation returned by get_default_implementation().
+                                                   size_t length) noexcept {
+  return get_default_implementation()->utf16_length_from_utf32(input, length);
+}
+simdutf_warn_unused size_t utf32_length_from_utf8(const char *input, // Delegates to the implementation returned by get_default_implementation().
+                                                  size_t length) noexcept {
+  return get_default_implementation()->utf32_length_from_utf8(input, length);
+}
+
+simdutf_warn_unused size_t // char-input overload; delegates to the implementation returned by get_default_implementation().
+maximal_binary_length_from_base64(const char *input, size_t length) noexcept {
+  return get_default_implementation()->maximal_binary_length_from_base64(
+      input, length);
+}
+
+simdutf_warn_unused result base64_to_binary( // char-input overload; delegates to the implementation returned by get_default_implementation().
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_handling_options) noexcept { // the parameter reuses its type's name, hiding the type inside the body
+  return get_default_implementation()->base64_to_binary(
+      input, length, output, options, last_chunk_handling_options);
+}
+
+simdutf_warn_unused size_t maximal_binary_length_from_base64( // char16_t-input overload; delegates to the implementation returned by get_default_implementation().
+    const char16_t *input, size_t length) noexcept {
+  return get_default_implementation()->maximal_binary_length_from_base64(
+      input, length);
+}
+
+simdutf_warn_unused result base64_to_binary( // char16_t-input overload; delegates to the implementation returned by get_default_implementation().
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_handling_options) noexcept { // the parameter reuses its type's name, hiding the type inside the body
+  return get_default_implementation()->base64_to_binary(
+      input, length, output, options, last_chunk_handling_options);
+}
+
+// Shared implementation behind both base64_to_binary_safe overloads: decodes
+// base64 text into `output` while taking the caller-supplied `outlen` into
+// account.
+//
+//  - input / length: the base64 text, in char or char16_t units.
+//  - output:         destination for the decoded bytes.
+//  - outlen:         in: size available in `output`; out: number of bytes
+//                    produced, on every path that assigns it (the early
+//                    error returns below leave it untouched).
+//  - options / last_chunk_handling_options: forwarded to the decoders.
+//
+// Returns a `result` carrying the error code and an input position.
+template <typename chartype>
+simdutf_warn_unused result base64_to_binary_safe_impl(
+    const chartype *input, size_t length, char *output, size_t &outlen,
+    base64_options options,
+    last_chunk_handling_options last_chunk_handling_options) noexcept {
+  static_assert(std::is_same<chartype, char>::value ||
+                    std::is_same<chartype, char16_t>::value,
+                "Only char and char16_t are supported.");
+  // The implementation could be nicer, but we expect that most times, the user
+  // will provide us with a buffer that is large enough.
+  size_t max_length = maximal_binary_length_from_base64(input, length);
+  if (outlen >= max_length) {
+    // fast path
+    // The buffer can hold the largest possible output, so the whole input is
+    // decoded in a single call.
+    full_result r = get_default_implementation()->base64_to_binary_details(
+        input, length, output, options, last_chunk_handling_options);
+    // These two errors are returned unchanged (see `return r` below); for
+    // every other outcome the reported input position is recomputed.
+    if (r.error != error_code::INVALID_BASE64_CHARACTER &&
+        r.error != error_code::BASE64_EXTRA_BITS) {
+      outlen = r.output_count;
+      if (last_chunk_handling_options == stop_before_partial) {
+        // When the output is not a whole number of 3-byte groups, check
+        // whether everything after r.input_count is white space or padding;
+        // if so, the entire input is reported as consumed.
+        if ((r.output_count % 3) != 0) {
+          bool empty_trail = true;
+          for (size_t i = r.input_count; i < length; i++) {
+            if (!scalar::base64::is_ascii_white_space_or_padding(input[i])) {
+              empty_trail = false;
+              break;
+            }
+          }
+          if (empty_trail) {
+            r.input_count = length;
+          }
+        }
+        return {r.error, r.input_count};
+      }
+      // Any other last-chunk mode reports the full input length.
+      return {r.error, length};
+    }
+    return r;
+  }
+  // The output buffer is maybe too small. We will decode a truncated version of
+  // the input.
+  size_t outlen3 = outlen / 3 * 3; // round down to multiple of 3
+  // Input length computed by base64_length_from_binary for outlen3 bytes.
+  size_t safe_input = base64_length_from_binary(outlen3, options);
+  // First pass: decode only that prefix, always in `loose` mode (the caller's
+  // last-chunk option is applied to the tail further down).
+  full_result r = get_default_implementation()->base64_to_binary_details(
+      input, safe_input, output, options, loose);
+  if (r.error == error_code::INVALID_BASE64_CHARACTER) {
+    // outlen is not updated on this path.
+    return r;
+  }
+  // Number of non-white-space input characters to step back over: 1 when the
+  // first pass reported BASE64_INPUT_REMAINDER, otherwise 0 if the first pass
+  // produced a multiple of 3 bytes, else (output_count % 3) + 1.
+  size_t offset =
+      (r.error == error_code::BASE64_INPUT_REMAINDER)
+          ? 1
+          : ((r.output_count % 3) == 0 ? 0 : (r.output_count % 3) + 1);
+  // Only the complete 3-byte groups of the first pass are kept; the tail
+  // decoder writes from this output position.
+  size_t output_index = r.output_count - (r.output_count % 3);
+  size_t input_index = safe_input;
+  // offset is a value that is no larger than 3. We backtrack
+  // by up to offset characters + an undetermined number of
+  // white space characters. It is expected that the next loop
+  // runs at most 3 times + the number of white space characters
+  // in between them, so we are not worried about performance.
+  while (offset > 0 && input_index > 0) {
+    chartype c = input[--input_index];
+    if (scalar::base64::is_ascii_white_space(c)) {
+      // skipping
+    } else {
+      offset--;
+    }
+  }
+  // Space left in `output` after the bytes kept from the first pass.
+  size_t remaining_out = outlen - output_index;
+  const chartype *tail_input = input + input_index;
+  size_t tail_length = length - input_index;
+  // Drop trailing white space from the tail.
+  while (tail_length > 0 &&
+         scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) {
+    tail_length--;
+  }
+  // Strip and count up to two trailing '=' characters; white space between
+  // the two is skipped.
+  size_t padding_characts = 0;
+  if (tail_length > 0 && tail_input[tail_length - 1] == '=') {
+    tail_length--;
+    padding_characts++;
+    while (tail_length > 0 &&
+           scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) {
+      tail_length--;
+    }
+    if (tail_length > 0 && tail_input[tail_length - 1] == '=') {
+      tail_length--;
+      padding_characts++;
+    }
+  }
+  // this will advance tail_input and tail_length
+  // NOTE(review): remaining_out is presumably also updated by the callee to
+  // the number of bytes it wrote, since it is added to output_index right
+  // after the call — confirm against base64_tail_decode_safe.
+  result rr = scalar::base64::base64_tail_decode_safe(
+      output + output_index, remaining_out, tail_input, tail_length,
+      padding_characts, options, last_chunk_handling_options);
+  outlen = output_index + remaining_out;
+  if (last_chunk_handling_options != stop_before_partial &&
+      rr.error == error_code::SUCCESS && padding_characts > 0) {
+    // additional checks
+    // Padding is only accepted when the output ends in a partial group and
+    // the '=' count matches it: outlen % 3 == 1 requires two '=',
+    // outlen % 3 == 2 requires one.
+    if ((outlen % 3 == 0) || ((outlen % 3) + 1 + padding_characts != 4)) {
+      rr.error = error_code::INVALID_BASE64_CHARACTER;
+    }
+  }
+  if (rr.error == error_code::SUCCESS &&
+      last_chunk_handling_options == stop_before_partial) {
+    // stop_before_partial: if the tail decoder moved tail_input forward,
+    // report its position relative to the start of the input; otherwise, if
+    // the first pass consumed input, add that amount to rr.count.
+    if (tail_input > input + input_index) {
+      rr.count = tail_input - input;
+    } else if (r.input_count > 0) {
+      rr.count = r.input_count + rr.count;
+    }
+    return rr;
+  }
+  // Remaining cases: shift rr.count by the position where the tail started.
+  rr.count += input_index;
+  return rr;
+}
+
+// Converts Latin-1 to UTF-8 for an output buffer of utf8_len bytes and
+// returns how far the output pointer advanced.
+//
+// Chunks are converted with convert_latin1_to_utf8 for as long as a chunk of
+// more than 16 input bytes is available; whatever is left is handed to the
+// scalar convert_safe routine together with the remaining output size.
+simdutf_warn_unused size_t convert_latin1_to_utf8_safe(
+    const char *buf, size_t len, char *utf8_output, size_t utf8_len) noexcept {
+  char *const out_begin = utf8_output;
+
+  // convert_latin1_to_utf8 will never write more than input length * 2, so a
+  // chunk is limited to half of the remaining output size.
+  size_t chunk = std::min(len, utf8_len / 2);
+  while (chunk > 16) {
+    const auto produced =
+        simdutf::convert_latin1_to_utf8(buf, chunk, utf8_output);
+
+    utf8_output += produced;
+    utf8_len -= produced;
+    buf += chunk;
+    len -= chunk;
+
+    chunk = std::min(len, utf8_len / 2);
+  }
+
+  // Remaining input goes through the scalar routine.
+  const auto tail_written =
+      scalar::latin1_to_utf8::convert_safe(buf, len, utf8_output, utf8_len);
+  utf8_output += tail_written;
+
+  return utf8_output - out_begin;
+}
+
+// 8-bit overload: instantiates base64_to_binary_safe_impl for `char` and
+// returns its result; `outlen` is passed through by reference.
+simdutf_warn_unused result base64_to_binary_safe(
+    const char *input, size_t length, char *output, size_t &outlen,
+    base64_options options,
+    last_chunk_handling_options last_chunk_handling_options) noexcept {
+  const result outcome = base64_to_binary_safe_impl<char>(
+      input, length, output, outlen, options, last_chunk_handling_options);
+  return outcome;
+}
+// 16-bit overload: instantiates base64_to_binary_safe_impl for `char16_t`
+// and returns its result; `outlen` is passed through by reference.
+simdutf_warn_unused result base64_to_binary_safe(
+    const char16_t *input, size_t length, char *output, size_t &outlen,
+    base64_options options,
+    last_chunk_handling_options last_chunk_handling_options) noexcept {
+  const result outcome = base64_to_binary_safe_impl<char16_t>(
+      input, length, output, outlen, options, last_chunk_handling_options);
+  return outcome;
+}
+
+// Returns the value computed by the default implementation's
+// base64_length_from_binary for `length` and `options`.
+simdutf_warn_unused size_t
+base64_length_from_binary(size_t length, base64_options options) noexcept {
+  const size_t encoded_length =
+      get_default_implementation()->base64_length_from_binary(length, options);
+  return encoded_length;
+}
+
+// Forwards the arguments to the default implementation's binary_to_base64
+// and returns the size_t it reports.
+size_t binary_to_base64(const char *input, size_t length, char *output,
+                        base64_options options) noexcept {
+  const size_t reported =
+      get_default_implementation()->binary_to_base64(input, length, output,
+                                                     options);
+  return reported;
+}
+
+// Returns the encoding_type that the default implementation's
+// autodetect_encoding reports for the buffer.
+simdutf_warn_unused simdutf::encoding_type
+autodetect_encoding(const char *buf, size_t length) noexcept {
+  const simdutf::encoding_type detected =
+      get_default_implementation()->autodetect_encoding(buf, length);
+  return detected;
+}
+// Returns the integer that the default implementation's detect_encodings
+// reports for the buffer.
+simdutf_warn_unused int detect_encodings(const char *buf,
+                                         size_t length) noexcept {
+  const int detected = get_default_implementation()->detect_encodings(buf, length);
+  return detected;
+}
+// Looks up, among the available implementations, the one whose name is the
+// stringified SIMDUTF_BUILTIN_IMPLEMENTATION macro.
+const implementation *builtin_implementation() {
+  // Function-local static: the lookup runs on the first call only and the
+  // pointer is reused afterwards.
+  static const implementation *const cached_builtin =
+      get_available_implementations()[SIMDUTF_STRINGIFY(
+          SIMDUTF_BUILTIN_IMPLEMENTATION)];
+  return cached_builtin;
+}
+
+// Public entry point that delegates to scalar::utf8::trim_partial_utf8 and
+// returns its value.
+simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length) {
+  const size_t trimmed = scalar::utf8::trim_partial_utf8(input, length);
+  return trimmed;
+}
+
+// Big-endian variant: delegates to scalar::utf16::trim_partial_utf16
+// instantiated with BIG.
+simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input,
+                                                size_t length) {
+  const size_t trimmed =
+      scalar::utf16::trim_partial_utf16<BIG>(input, length);
+  return trimmed;
+}
+
+// Little-endian variant: delegates to scalar::utf16::trim_partial_utf16
+// instantiated with LITTLE.
+simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input,
+                                                size_t length) {
+  const size_t trimmed =
+      scalar::utf16::trim_partial_utf16<LITTLE>(input, length);
+  return trimmed;
+}
+
+// Endianness-agnostic entry point: picks the little- or big-endian variant
+// at compile time from SIMDUTF_IS_BIG_ENDIAN.
+simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input,
+                                              size_t length) {
+#if !SIMDUTF_IS_BIG_ENDIAN
+  return trim_partial_utf16le(input, length);
+#else
+  return trim_partial_utf16be(input, length);
+#endif
+}
+
+} // namespace simdutf
+/* end file src/implementation.cpp */
+/* begin file src/encoding_types.cpp */
+
+namespace simdutf {
+// Returns true when `e` equals the endianness selected at compile time by
+// SIMDUTF_IS_BIG_ENDIAN.
+bool match_system(endianness e) {
+#if SIMDUTF_IS_BIG_ENDIAN
+  const endianness native = endianness::BIG;
+#else
+  const endianness native = endianness::LITTLE;
+#endif
+  return e == native;
+}
+
+// Maps an encoding_type to a human-readable label. `unspecified` yields
+// "unknown"; any value that is not one of the listed enumerators yields
+// "error".
+std::string to_string(encoding_type bom) {
+  if (bom == UTF16_LE) {
+    return "UTF16 little-endian";
+  }
+  if (bom == UTF16_BE) {
+    return "UTF16 big-endian";
+  }
+  if (bom == UTF32_LE) {
+    return "UTF32 little-endian";
+  }
+  if (bom == UTF32_BE) {
+    return "UTF32 big-endian";
+  }
+  if (bom == UTF8) {
+    return "UTF8";
+  }
+  if (bom == unspecified) {
+    return "unknown";
+  }
+  return "error";
+}
+
+namespace BOM {
+// Inspects the first bytes of a buffer for a byte-order mark (BOM) and
+// returns the matching encoding_type, or encoding_type::unspecified when no
+// mark is recognised or the buffer is too short to contain it.
+//
+// Recognised marks:
+//   FF FE 00 00 -> UTF32_LE (checked before FF FE, which it starts with)
+//   FF FE       -> UTF16_LE
+//   FE FF       -> UTF16_BE
+//   00 00 FE FF -> UTF32_BE
+//   EF BB BF    -> UTF8
+//
+// `byte` is only read at indices guarded by the preceding `length` test.
+//
+// Note that BOM for UTF8 is discouraged.
+encoding_type check_bom(const uint8_t *byte, size_t length) {
+  if (length >= 2 && byte[0] == 0xff && byte[1] == 0xfe) {
+    // FF FE also begins the UTF-32 LE mark; the next two bytes decide.
+    if (length >= 4 && byte[2] == 0x00 && byte[3] == 0x00) {
+      return encoding_type::UTF32_LE;
+    }
+    return encoding_type::UTF16_LE;
+  }
+  if (length >= 2 && byte[0] == 0xfe && byte[1] == 0xff) {
+    return encoding_type::UTF16_BE;
+  }
+  if (length >= 4 && byte[0] == 0x00 && byte[1] == 0x00 && byte[2] == 0xfe &&
+      byte[3] == 0xff) {
+    return encoding_type::UTF32_BE;
+  }
+  // The UTF-8 mark is three bytes long, so a three-byte buffer is enough
+  // (requiring four bytes missed an input consisting of the mark alone).
+  if (length >= 3 && byte[0] == 0xef && byte[1] == 0xbb && byte[2] == 0xbf) {
+    return encoding_type::UTF8;
+  }
+  return encoding_type::unspecified;
+}
+
+// `char` overload: reinterprets the buffer as bytes and defers to the
+// uint8_t overload.
+encoding_type check_bom(const char *byte, size_t length) {
+  const uint8_t *as_bytes = reinterpret_cast<const uint8_t *>(byte);
+  return check_bom(as_bytes, length);
+}
+
+// Returns the size in bytes associated with each encoding's byte-order
+// mark: 2 for UTF-16, 4 for UTF-32, 3 for UTF-8, and 0 for `unspecified` or
+// any other value.
+size_t bom_byte_size(encoding_type bom) {
+  switch (bom) {
+  case UTF16_LE:
+  case UTF16_BE:
+    return 2;
+  case UTF32_LE:
+  case UTF32_BE:
+    return 4;
+  case UTF8:
+    return 3;
+  case unspecified:
+  default:
+    return 0;
+  }
+}
+
+} // namespace BOM
+} // namespace simdutf
+/* end file src/encoding_types.cpp */
+/* begin file src/error.cpp */
+namespace simdutf {
+// deliberately empty
+}
+/* end file src/error.cpp */
+// The large tables should be included once and they
+// should not depend on a kernel.
+/* begin file src/tables/utf8_to_utf16_tables.h */
+#ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H
+#define SIMDUTF_UTF8_TO_UTF16_TABLES_H
+#include <cstdint>
+
+namespace simdutf {
+namespace {
+namespace tables {
+namespace utf8_to_utf16 {
+/**
+ * utf8bigindex uses about 8 kB
+ * shufutf8 uses about 3344 B
+ *
+ * So we use a bit over 11 kB. It would be
+ * easy to save about 4 kB by only
+ * storing the index in utf8bigindex, and
+ * deriving the consumed bytes otherwise.
+ * However, this may come at a significant (10% to 20%)
+ * performance penalty.
+ */
+
+const uint8_t shufutf8[209][16] = {
+    {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0},
+    {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0},
+    {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0},
+    {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0},
+    {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0},
+    {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0},
+    {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0},
+    {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0},
+    {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
+    {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
+    {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
+    {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
+    {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
+    {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
+    {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
+    {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
+    {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
+    {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
+    {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
+    {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
+    {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
+    {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
+    {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
+    {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
+    {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
+    {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
+    {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
+    {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
+    {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
+    {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
+    {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
+    {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
+    {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0},
+    {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0},
+    {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0},
+    {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0},
+    {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0},
+    {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0},
+    {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0},
+    {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0},
+    {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
+    {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
+    {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
+    {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
+    {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
+    {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
+    {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
+    {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
+    {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0},
+    {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0},
+    {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0},
+    {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0},
+    {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0},
+    {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0},
+    {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0},
+    {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0},
+    {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0},
+    {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0},
+    {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0},
+    {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0},
+    {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0},
+    {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0},
+    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0},
+    {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0},
+    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255},
+    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255},
+    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255},
+    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255},
+    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255},
+    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255},
+    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255},
+    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255},
+    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255},
+    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
+    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
+    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
+    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
+    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
+    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
+    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
+    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
+    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
+    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255},
+    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255},
+    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255},
+    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255},
+    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255},
+    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255},
+    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255},
+    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255},
+    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255},
+    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255},
+    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255},
+    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255},
+    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255},
+    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255},
+    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255},
+    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255},
+    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255},
+    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255},
+    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
+    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
+    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
+    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
+    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
+    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
+    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
+    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
+    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
+    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255},
+    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255},
+    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255},
+    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255},
+    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255},
+    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255},
+    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255},
+    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255},
+    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255},
+    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255},
+    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255},
+    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255},
+    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255},
+    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255},
+    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255},
+    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255},
+    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255},
+    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255},
+    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255},
+    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255},
+    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255},
+    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255},
+    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255},
+    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255},
+    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255},
+    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255},
+    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255},
+    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255},
+    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255},
+    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255},
+    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255},
+    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255},
+    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255},
+    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255},
+    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255},
+    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255},
+    {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0},
+    {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0},
+    {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0},
+    {0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0},
+    {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
+    {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
+    {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
+    {0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
+    {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0},
+    {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0},
+    {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0},
+    {0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0},
+    {0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0},
+    {0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0},
+    {0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0},
+    {0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0},
+    {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0},
+    {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0},
+    {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0},
+    {1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0},
+    {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
+    {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
+    {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
+    {1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
+    {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0},
+    {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0},
+    {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0},
+    {1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0},
+    {1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0},
+    {1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0},
+    {1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0},
+    {1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0},
+    {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0},
+    {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0},
+    {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0},
+    {2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0},
+    {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
+    {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
+    {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
+    {2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
+    {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0},
+    {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0},
+    {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0},
+    {2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0},
+    {2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0},
+    {2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0},
+    {2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0},
+    {2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0},
+    {3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0},
+    {3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0},
+    {3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0},
+    {3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0},
+    {3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0},
+    {3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0},
+    {3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0},
+    {3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0},
+    {3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0},
+    {3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0},
+    {3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0},
+    {3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0},
+    {3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0},
+    {3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0},
+    {3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0},
+    {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}};
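+/* Cumulative row counts of shufutf8: */
+/* rows using at most two input bytes per output element : 64 */
+/* rows using at most two or three input bytes per output element : 145 */
+/* rows using at most two, three or four input bytes per output element : 209 */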
+const uint8_t utf8bigindex[4096][2] = {
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},
+    {161, 4},  {64, 4},   {209, 12}, {209, 12}, {209, 12}, {147, 5},  {209, 12},
+    {150, 5},  {162, 5},  {65, 5},   {209, 12}, {153, 5},  {165, 5},  {67, 5},
+    {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {209, 12},
+    {148, 6},  {209, 12}, {151, 6},  {163, 6},  {66, 6},   {209, 12}, {154, 6},
+    {166, 6},  {68, 6},   {178, 6},  {74, 6},   {92, 6},   {64, 4},   {209, 12},
+    {157, 6},  {169, 6},  {70, 6},   {181, 6},  {76, 6},   {94, 6},   {65, 5},
+    {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},
+    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {152, 7},
+    {164, 7},  {145, 3},  {209, 12}, {155, 7},  {167, 7},  {69, 7},   {179, 7},
+    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {170, 7},  {71, 7},
+    {182, 7},  {77, 7},   {95, 7},   {65, 5},   {194, 7},  {83, 7},   {101, 7},
+    {67, 5},   {119, 7},  {73, 5},   {91, 5},   {1, 7},    {209, 12}, {209, 12},
+    {173, 7},  {148, 6},  {185, 7},  {79, 7},   {97, 7},   {66, 6},   {197, 7},
+    {85, 7},   {103, 7},  {68, 6},   {121, 7},  {74, 6},   {92, 6},   {2, 7},
+    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {76, 6},   {94, 6},
+    {4, 7},    {193, 6},  {82, 6},   {100, 6},  {8, 7},    {118, 6},  {16, 7},
+    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {145, 3},  {209, 12}, {156, 8},  {168, 8},  {146, 4},
+    {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {171, 8},
+    {72, 8},   {183, 8},  {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},
+    {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
+    {209, 12}, {174, 8},  {148, 6},  {186, 8},  {80, 8},   {98, 8},   {66, 6},
+    {198, 8},  {86, 8},   {104, 8},  {68, 6},   {122, 8},  {74, 6},   {92, 6},
+    {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {76, 6},
+    {94, 6},   {5, 8},    {193, 6},  {82, 6},   {100, 6},  {9, 8},    {118, 6},
+    {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},
+    {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
+    {112, 8},  {71, 7},   {130, 8},  {77, 7},   {95, 7},   {6, 8},    {194, 7},
+    {83, 7},   {101, 7},  {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},
+    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},
+    {66, 6},   {197, 7},  {85, 7},   {103, 7},  {12, 8},   {121, 7},  {20, 8},
+    {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
+    {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},
+    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12},
+    {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12},
+    {160, 9},  {172, 9},  {147, 5},  {184, 9},  {150, 5},  {162, 5},  {65, 5},
+    {196, 9},  {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},
+    {64, 4},   {209, 12}, {209, 12}, {175, 9},  {148, 6},  {187, 9},  {81, 9},
+    {99, 9},   {66, 6},   {199, 9},  {87, 9},   {105, 9},  {68, 6},   {123, 9},
+    {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {111, 9},  {70, 6},
+    {129, 9},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},
+    {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {190, 9},  {152, 7},  {164, 7},  {145, 3},  {202, 9},
+    {89, 9},   {107, 9},  {69, 7},   {125, 9},  {75, 7},   {93, 7},   {64, 4},
+    {209, 12}, {158, 7},  {113, 9},  {71, 7},   {131, 9},  {77, 7},   {95, 7},
+    {7, 9},    {194, 7},  {83, 7},   {101, 7},  {11, 9},   {119, 7},  {19, 9},
+    {35, 9},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {137, 9},
+    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},  {13, 9},
+    {121, 7},  {21, 9},   {37, 9},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
+    {70, 6},   {127, 7},  {25, 9},   {41, 9},   {4, 7},    {193, 6},  {82, 6},
+    {49, 9},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
+    {205, 9},  {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},
+    {64, 4},   {209, 12}, {159, 8},  {115, 9},  {72, 8},   {133, 9},  {78, 8},
+    {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},
+    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},
+    {139, 9},  {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {104, 8},
+    {14, 9},   {122, 8},  {22, 9},   {38, 9},   {3, 8},    {209, 12}, {157, 6},
+    {110, 8},  {70, 6},   {128, 8},  {26, 9},   {42, 9},   {5, 8},    {193, 6},
+    {82, 6},   {50, 9},   {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},
+    {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},
+    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},
+    {28, 9},   {44, 9},   {6, 8},    {194, 7},  {83, 7},   {52, 9},   {10, 8},
+    {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
+    {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
+    {56, 9},   {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12},
+    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},
+    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},
+    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12},
+    {149, 4},  {161, 4},  {64, 4},   {209, 12}, {209, 12}, {209, 12}, {147, 5},
+    {209, 12}, {150, 5},  {162, 5},  {65, 5},   {209, 12}, {153, 5},  {165, 5},
+    {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12},
+    {176, 10}, {148, 6},  {188, 10}, {151, 6},  {163, 6},  {66, 6},   {200, 10},
+    {154, 6},  {166, 6},  {68, 6},   {178, 6},  {74, 6},   {92, 6},   {64, 4},
+    {209, 12}, {157, 6},  {169, 6},  {70, 6},   {181, 6},  {76, 6},   {94, 6},
+    {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},
+    {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10},
+    {152, 7},  {164, 7},  {145, 3},  {203, 10}, {90, 10},  {108, 10}, {69, 7},
+    {126, 10}, {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {114, 10},
+    {71, 7},   {132, 10}, {77, 7},   {95, 7},   {65, 5},   {194, 7},  {83, 7},
+    {101, 7},  {67, 5},   {119, 7},  {73, 5},   {91, 5},   {1, 7},    {209, 12},
+    {209, 12}, {173, 7},  {148, 6},  {138, 10}, {79, 7},   {97, 7},   {66, 6},
+    {197, 7},  {85, 7},   {103, 7},  {68, 6},   {121, 7},  {74, 6},   {92, 6},
+    {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {76, 6},
+    {94, 6},   {4, 7},    {193, 6},  {82, 6},   {100, 6},  {8, 7},    {118, 6},
+    {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {145, 3},  {206, 10}, {156, 8},  {168, 8},
+    {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},
+    {116, 10}, {72, 8},   {134, 10}, {78, 8},   {96, 8},   {65, 5},   {195, 8},
+    {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},
+    {209, 12}, {209, 12}, {174, 8},  {148, 6},  {140, 10}, {80, 8},   {98, 8},
+    {66, 6},   {198, 8},  {86, 8},   {104, 8},  {15, 10},  {122, 8},  {23, 10},
+    {39, 10},  {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},
+    {27, 10},  {43, 10},  {5, 8},    {193, 6},  {82, 6},   {51, 10},  {9, 8},
+    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},
+    {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12},
+    {158, 7},  {112, 8},  {71, 7},   {130, 8},  {29, 10},  {45, 10},  {6, 8},
+    {194, 7},  {83, 7},   {53, 10},  {10, 8},   {119, 7},  {18, 8},   {34, 8},
+    {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},
+    {97, 7},   {66, 6},   {197, 7},  {85, 7},   {57, 10},  {12, 8},   {121, 7},
+    {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},
+    {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},
+    {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12},
+    {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},
+    {209, 12}, {160, 9},  {172, 9},  {147, 5},  {184, 9},  {150, 5},  {162, 5},
+    {65, 5},   {196, 9},  {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},
+    {91, 5},   {64, 4},   {209, 12}, {209, 12}, {175, 9},  {148, 6},  {142, 10},
+    {81, 9},   {99, 9},   {66, 6},   {199, 9},  {87, 9},   {105, 9},  {68, 6},
+    {123, 9},  {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {111, 9},
+    {70, 6},   {129, 9},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},
+    {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {190, 9},  {152, 7},  {164, 7},  {145, 3},
+    {202, 9},  {89, 9},   {107, 9},  {69, 7},   {125, 9},  {75, 7},   {93, 7},
+    {64, 4},   {209, 12}, {158, 7},  {113, 9},  {71, 7},   {131, 9},  {30, 10},
+    {46, 10},  {7, 9},    {194, 7},  {83, 7},   {54, 10},  {11, 9},   {119, 7},
+    {19, 9},   {35, 9},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},
+    {137, 9},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {58, 10},
+    {13, 9},   {121, 7},  {21, 9},   {37, 9},   {2, 7},    {209, 12}, {157, 6},
+    {109, 7},  {70, 6},   {127, 7},  {25, 9},   {41, 9},   {4, 7},    {193, 6},
+    {82, 6},   {49, 9},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {145, 3},  {205, 9},  {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},
+    {161, 4},  {64, 4},   {209, 12}, {159, 8},  {115, 9},  {72, 8},   {133, 9},
+    {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},
+    {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},
+    {148, 6},  {139, 9},  {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},
+    {60, 10},  {14, 9},   {122, 8},  {22, 9},   {38, 9},   {3, 8},    {209, 12},
+    {157, 6},  {110, 8},  {70, 6},   {128, 8},  {26, 9},   {42, 9},   {5, 8},
+    {193, 6},  {82, 6},   {50, 9},   {9, 8},    {118, 6},  {17, 8},   {33, 8},
+    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},
+    {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},
+    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},
+    {130, 8},  {28, 9},   {44, 9},   {6, 8},    {194, 7},  {83, 7},   {52, 9},
+    {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12},
+    {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},
+    {85, 7},   {56, 9},   {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},
+    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},
+    {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},
+    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},
+    {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12}, {209, 12}, {209, 12},
+    {147, 5},  {209, 12}, {150, 5},  {162, 5},  {65, 5},   {209, 12}, {153, 5},
+    {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
+    {209, 12}, {209, 12}, {148, 6},  {209, 12}, {151, 6},  {163, 6},  {66, 6},
+    {209, 12}, {154, 6},  {166, 6},  {68, 6},   {178, 6},  {74, 6},   {92, 6},
+    {64, 4},   {209, 12}, {157, 6},  {169, 6},  {70, 6},   {181, 6},  {76, 6},
+    {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},
+    {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {192, 11}, {152, 7},  {164, 7},  {145, 3},  {204, 11}, {155, 7},  {167, 7},
+    {69, 7},   {179, 7},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
+    {170, 7},  {71, 7},   {182, 7},  {77, 7},   {95, 7},   {65, 5},   {194, 7},
+    {83, 7},   {101, 7},  {67, 5},   {119, 7},  {73, 5},   {91, 5},   {1, 7},
+    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {185, 7},  {79, 7},   {97, 7},
+    {66, 6},   {197, 7},  {85, 7},   {103, 7},  {68, 6},   {121, 7},  {74, 6},
+    {92, 6},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
+    {76, 6},   {94, 6},   {4, 7},    {193, 6},  {82, 6},   {100, 6},  {8, 7},
+    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {207, 11}, {156, 8},
+    {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12},
+    {159, 8},  {117, 11}, {72, 8},   {135, 11}, {78, 8},   {96, 8},   {65, 5},
+    {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},
+    {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},  {141, 11}, {80, 8},
+    {98, 8},   {66, 6},   {198, 8},  {86, 8},   {104, 8},  {68, 6},   {122, 8},
+    {74, 6},   {92, 6},   {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},
+    {128, 8},  {76, 6},   {94, 6},   {5, 8},    {193, 6},  {82, 6},   {100, 6},
+    {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},
+    {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},
+    {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},  {77, 7},   {95, 7},
+    {6, 8},    {194, 7},  {83, 7},   {101, 7},  {10, 8},   {119, 7},  {18, 8},
+    {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},
+    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},  {12, 8},
+    {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
+    {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},
+    {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
+    {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},
+    {64, 4},   {209, 12}, {160, 9},  {172, 9},  {147, 5},  {184, 9},  {150, 5},
+    {162, 5},  {65, 5},   {196, 9},  {153, 5},  {165, 5},  {67, 5},   {177, 5},
+    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {175, 9},  {148, 6},
+    {143, 11}, {81, 9},   {99, 9},   {66, 6},   {199, 9},  {87, 9},   {105, 9},
+    {68, 6},   {123, 9},  {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},
+    {111, 9},  {70, 6},   {129, 9},  {76, 6},   {94, 6},   {65, 5},   {193, 6},
+    {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9},  {152, 7},  {164, 7},
+    {145, 3},  {202, 9},  {89, 9},   {107, 9},  {69, 7},   {125, 9},  {75, 7},
+    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {113, 9},  {71, 7},   {131, 9},
+    {31, 11},  {47, 11},  {7, 9},    {194, 7},  {83, 7},   {55, 11},  {11, 9},
+    {119, 7},  {19, 9},   {35, 9},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
+    {148, 6},  {137, 9},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
+    {59, 11},  {13, 9},   {121, 7},  {21, 9},   {37, 9},   {2, 7},    {209, 12},
+    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {25, 9},   {41, 9},   {4, 7},
+    {193, 6},  {82, 6},   {49, 9},   {8, 7},    {118, 6},  {16, 7},   {32, 7},
+    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {145, 3},  {205, 9},  {156, 8},  {168, 8},  {146, 4},  {180, 8},
+    {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {115, 9},  {72, 8},
+    {133, 9},  {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},
+    {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12},
+    {174, 8},  {148, 6},  {139, 9},  {80, 8},   {98, 8},   {66, 6},   {198, 8},
+    {86, 8},   {61, 11},  {14, 9},   {122, 8},  {22, 9},   {38, 9},   {3, 8},
+    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {26, 9},   {42, 9},
+    {5, 8},    {193, 6},  {82, 6},   {50, 9},   {9, 8},    {118, 6},  {17, 8},
+    {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},
+    {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},
+    {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},
+    {71, 7},   {130, 8},  {28, 9},   {44, 9},   {6, 8},    {194, 7},  {83, 7},
+    {52, 9},   {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12},
+    {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},
+    {197, 7},  {85, 7},   {56, 9},   {12, 8},   {121, 7},  {20, 8},   {36, 8},
+    {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},
+    {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},
+    {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12},
+    {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12}, {209, 12},
+    {209, 12}, {147, 5},  {209, 12}, {150, 5},  {162, 5},  {65, 5},   {209, 12},
+    {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},
+    {209, 12}, {209, 12}, {176, 10}, {148, 6},  {188, 10}, {151, 6},  {163, 6},
+    {66, 6},   {200, 10}, {154, 6},  {166, 6},  {68, 6},   {178, 6},  {74, 6},
+    {92, 6},   {64, 4},   {209, 12}, {157, 6},  {169, 6},  {70, 6},   {181, 6},
+    {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},
+    {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {191, 10}, {152, 7},  {164, 7},  {145, 3},  {203, 10}, {90, 10},
+    {108, 10}, {69, 7},   {126, 10}, {75, 7},   {93, 7},   {64, 4},   {209, 12},
+    {158, 7},  {114, 10}, {71, 7},   {132, 10}, {77, 7},   {95, 7},   {65, 5},
+    {194, 7},  {83, 7},   {101, 7},  {67, 5},   {119, 7},  {73, 5},   {91, 5},
+    {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {138, 10}, {79, 7},
+    {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},  {68, 6},   {121, 7},
+    {74, 6},   {92, 6},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},
+    {127, 7},  {76, 6},   {94, 6},   {4, 7},    {193, 6},  {82, 6},   {100, 6},
+    {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {206, 10},
+    {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},
+    {209, 12}, {159, 8},  {116, 10}, {72, 8},   {134, 10}, {78, 8},   {96, 8},
+    {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},
+    {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},  {140, 10},
+    {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {62, 11},  {15, 10},
+    {122, 8},  {23, 10},  {39, 10},  {3, 8},    {209, 12}, {157, 6},  {110, 8},
+    {70, 6},   {128, 8},  {27, 10},  {43, 10},  {5, 8},    {193, 6},  {82, 6},
+    {51, 10},  {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},
+    {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},
+    {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},  {29, 10},
+    {45, 10},  {6, 8},    {194, 7},  {83, 7},   {53, 10},  {10, 8},   {119, 7},
+    {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},
+    {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {57, 10},
+    {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},
+    {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},
+    {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},
+    {161, 4},  {64, 4},   {209, 12}, {160, 9},  {172, 9},  {147, 5},  {184, 9},
+    {150, 5},  {162, 5},  {65, 5},   {196, 9},  {153, 5},  {165, 5},  {67, 5},
+    {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {175, 9},
+    {148, 6},  {142, 10}, {81, 9},   {99, 9},   {66, 6},   {199, 9},  {87, 9},
+    {105, 9},  {68, 6},   {123, 9},  {74, 6},   {92, 6},   {64, 4},   {209, 12},
+    {157, 6},  {111, 9},  {70, 6},   {129, 9},  {76, 6},   {94, 6},   {65, 5},
+    {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},
+    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9},  {152, 7},
+    {164, 7},  {145, 3},  {202, 9},  {89, 9},   {107, 9},  {69, 7},   {125, 9},
+    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {113, 9},  {71, 7},
+    {131, 9},  {30, 10},  {46, 10},  {7, 9},    {194, 7},  {83, 7},   {54, 10},
+    {11, 9},   {119, 7},  {19, 9},   {35, 9},   {1, 7},    {209, 12}, {209, 12},
+    {173, 7},  {148, 6},  {137, 9},  {79, 7},   {97, 7},   {66, 6},   {197, 7},
+    {85, 7},   {58, 10},  {13, 9},   {121, 7},  {21, 9},   {37, 9},   {2, 7},
+    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {25, 9},   {41, 9},
+    {4, 7},    {193, 6},  {82, 6},   {49, 9},   {8, 7},    {118, 6},  {16, 7},
+    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {145, 3},  {205, 9},  {156, 8},  {168, 8},  {146, 4},
+    {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {115, 9},
+    {72, 8},   {133, 9},  {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},
+    {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
+    {209, 12}, {174, 8},  {148, 6},  {139, 9},  {80, 8},   {98, 8},   {66, 6},
+    {198, 8},  {86, 8},   {60, 10},  {14, 9},   {122, 8},  {22, 9},   {38, 9},
+    {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {26, 9},
+    {42, 9},   {5, 8},    {193, 6},  {82, 6},   {50, 9},   {9, 8},    {118, 6},
+    {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},
+    {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
+    {112, 8},  {71, 7},   {130, 8},  {28, 9},   {44, 9},   {6, 8},    {194, 7},
+    {83, 7},   {52, 9},   {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},
+    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},
+    {66, 6},   {197, 7},  {85, 7},   {56, 9},   {12, 8},   {121, 7},  {20, 8},
+    {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
+    {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},
+    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12},
+    {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12},
+    {209, 12}, {209, 12}, {147, 5},  {209, 12}, {150, 5},  {162, 5},  {65, 5},
+    {209, 12}, {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},
+    {64, 4},   {209, 12}, {209, 12}, {209, 12}, {148, 6},  {209, 12}, {151, 6},
+    {163, 6},  {66, 6},   {209, 12}, {154, 6},  {166, 6},  {68, 6},   {178, 6},
+    {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {169, 6},  {70, 6},
+    {181, 6},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},
+    {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {152, 7},  {164, 7},  {145, 3},  {209, 12},
+    {155, 7},  {167, 7},  {69, 7},   {179, 7},  {75, 7},   {93, 7},   {64, 4},
+    {209, 12}, {158, 7},  {170, 7},  {71, 7},   {182, 7},  {77, 7},   {95, 7},
+    {65, 5},   {194, 7},  {83, 7},   {101, 7},  {67, 5},   {119, 7},  {73, 5},
+    {91, 5},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {185, 7},
+    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},  {68, 6},
+    {121, 7},  {74, 6},   {92, 6},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
+    {70, 6},   {127, 7},  {76, 6},   {94, 6},   {4, 7},    {193, 6},  {82, 6},
+    {100, 6},  {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
+    {208, 12}, {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},
+    {64, 4},   {209, 12}, {159, 8},  {171, 8},  {72, 8},   {183, 8},  {78, 8},
+    {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},
+    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},
+    {186, 8},  {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {104, 8},
+    {68, 6},   {122, 8},  {74, 6},   {92, 6},   {3, 8},    {209, 12}, {157, 6},
+    {110, 8},  {70, 6},   {128, 8},  {76, 6},   {94, 6},   {5, 8},    {193, 6},
+    {82, 6},   {100, 6},  {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},
+    {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},
+    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},
+    {77, 7},   {95, 7},   {6, 8},    {194, 7},  {83, 7},   {101, 7},  {10, 8},
+    {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
+    {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
+    {103, 7},  {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12},
+    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},
+    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},
+    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12},
+    {149, 4},  {161, 4},  {64, 4},   {209, 12}, {160, 9},  {172, 9},  {147, 5},
+    {184, 9},  {150, 5},  {162, 5},  {65, 5},   {196, 9},  {153, 5},  {165, 5},
+    {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12},
+    {175, 9},  {148, 6},  {144, 12}, {81, 9},   {99, 9},   {66, 6},   {199, 9},
+    {87, 9},   {105, 9},  {68, 6},   {123, 9},  {74, 6},   {92, 6},   {64, 4},
+    {209, 12}, {157, 6},  {111, 9},  {70, 6},   {129, 9},  {76, 6},   {94, 6},
+    {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},
+    {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9},
+    {152, 7},  {164, 7},  {145, 3},  {202, 9},  {89, 9},   {107, 9},  {69, 7},
+    {125, 9},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {113, 9},
+    {71, 7},   {131, 9},  {77, 7},   {95, 7},   {7, 9},    {194, 7},  {83, 7},
+    {101, 7},  {11, 9},   {119, 7},  {19, 9},   {35, 9},   {1, 7},    {209, 12},
+    {209, 12}, {173, 7},  {148, 6},  {137, 9},  {79, 7},   {97, 7},   {66, 6},
+    {197, 7},  {85, 7},   {103, 7},  {13, 9},   {121, 7},  {21, 9},   {37, 9},
+    {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {25, 9},
+    {41, 9},   {4, 7},    {193, 6},  {82, 6},   {49, 9},   {8, 7},    {118, 6},
+    {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {145, 3},  {205, 9},  {156, 8},  {168, 8},
+    {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},
+    {115, 9},  {72, 8},   {133, 9},  {78, 8},   {96, 8},   {65, 5},   {195, 8},
+    {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},
+    {209, 12}, {209, 12}, {174, 8},  {148, 6},  {139, 9},  {80, 8},   {98, 8},
+    {66, 6},   {198, 8},  {86, 8},   {104, 8},  {14, 9},   {122, 8},  {22, 9},
+    {38, 9},   {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},
+    {26, 9},   {42, 9},   {5, 8},    {193, 6},  {82, 6},   {50, 9},   {9, 8},
+    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},
+    {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12},
+    {158, 7},  {112, 8},  {71, 7},   {130, 8},  {28, 9},   {44, 9},   {6, 8},
+    {194, 7},  {83, 7},   {52, 9},   {10, 8},   {119, 7},  {18, 8},   {34, 8},
+    {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},
+    {97, 7},   {66, 6},   {197, 7},  {85, 7},   {56, 9},   {12, 8},   {121, 7},
+    {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},
+    {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},
+    {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12},
+    {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},
+    {209, 12}, {209, 12}, {209, 12}, {147, 5},  {209, 12}, {150, 5},  {162, 5},
+    {65, 5},   {209, 12}, {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},
+    {91, 5},   {64, 4},   {209, 12}, {209, 12}, {176, 10}, {148, 6},  {188, 10},
+    {151, 6},  {163, 6},  {66, 6},   {200, 10}, {154, 6},  {166, 6},  {68, 6},
+    {178, 6},  {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {169, 6},
+    {70, 6},   {181, 6},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},
+    {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7},  {164, 7},  {145, 3},
+    {203, 10}, {90, 10},  {108, 10}, {69, 7},   {126, 10}, {75, 7},   {93, 7},
+    {64, 4},   {209, 12}, {158, 7},  {114, 10}, {71, 7},   {132, 10}, {77, 7},
+    {95, 7},   {65, 5},   {194, 7},  {83, 7},   {101, 7},  {67, 5},   {119, 7},
+    {73, 5},   {91, 5},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},
+    {138, 10}, {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {103, 7},
+    {68, 6},   {121, 7},  {74, 6},   {92, 6},   {2, 7},    {209, 12}, {157, 6},
+    {109, 7},  {70, 6},   {127, 7},  {76, 6},   {94, 6},   {4, 7},    {193, 6},
+    {82, 6},   {100, 6},  {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {145, 3},  {206, 10}, {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},
+    {161, 4},  {64, 4},   {209, 12}, {159, 8},  {116, 10}, {72, 8},   {134, 10},
+    {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},
+    {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},
+    {148, 6},  {140, 10}, {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},
+    {63, 12},  {15, 10},  {122, 8},  {23, 10},  {39, 10},  {3, 8},    {209, 12},
+    {157, 6},  {110, 8},  {70, 6},   {128, 8},  {27, 10},  {43, 10},  {5, 8},
+    {193, 6},  {82, 6},   {51, 10},  {9, 8},    {118, 6},  {17, 8},   {33, 8},
+    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},
+    {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},
+    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},
+    {130, 8},  {29, 10},  {45, 10},  {6, 8},    {194, 7},  {83, 7},   {53, 10},
+    {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12},
+    {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},
+    {85, 7},   {57, 10},  {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},
+    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},
+    {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},
+    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},
+    {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12}, {160, 9},  {172, 9},
+    {147, 5},  {184, 9},  {150, 5},  {162, 5},  {65, 5},   {196, 9},  {153, 5},
+    {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
+    {209, 12}, {175, 9},  {148, 6},  {142, 10}, {81, 9},   {99, 9},   {66, 6},
+    {199, 9},  {87, 9},   {105, 9},  {68, 6},   {123, 9},  {74, 6},   {92, 6},
+    {64, 4},   {209, 12}, {157, 6},  {111, 9},  {70, 6},   {129, 9},  {76, 6},
+    {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},
+    {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {190, 9},  {152, 7},  {164, 7},  {145, 3},  {202, 9},  {89, 9},   {107, 9},
+    {69, 7},   {125, 9},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
+    {113, 9},  {71, 7},   {131, 9},  {30, 10},  {46, 10},  {7, 9},    {194, 7},
+    {83, 7},   {54, 10},  {11, 9},   {119, 7},  {19, 9},   {35, 9},   {1, 7},
+    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {137, 9},  {79, 7},   {97, 7},
+    {66, 6},   {197, 7},  {85, 7},   {58, 10},  {13, 9},   {121, 7},  {21, 9},
+    {37, 9},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
+    {25, 9},   {41, 9},   {4, 7},    {193, 6},  {82, 6},   {49, 9},   {8, 7},
+    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {205, 9},  {156, 8},
+    {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12},
+    {159, 8},  {115, 9},  {72, 8},   {133, 9},  {78, 8},   {96, 8},   {65, 5},
+    {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},
+    {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},  {139, 9},  {80, 8},
+    {98, 8},   {66, 6},   {198, 8},  {86, 8},   {60, 10},  {14, 9},   {122, 8},
+    {22, 9},   {38, 9},   {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},
+    {128, 8},  {26, 9},   {42, 9},   {5, 8},    {193, 6},  {82, 6},   {50, 9},
+    {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},
+    {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},
+    {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},  {28, 9},   {44, 9},
+    {6, 8},    {194, 7},  {83, 7},   {52, 9},   {10, 8},   {119, 7},  {18, 8},
+    {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},
+    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {56, 9},   {12, 8},
+    {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
+    {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},
+    {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
+    {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},
+    {64, 4},   {209, 12}, {209, 12}, {209, 12}, {147, 5},  {209, 12}, {150, 5},
+    {162, 5},  {65, 5},   {209, 12}, {153, 5},  {165, 5},  {67, 5},   {177, 5},
+    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {209, 12}, {148, 6},
+    {209, 12}, {151, 6},  {163, 6},  {66, 6},   {209, 12}, {154, 6},  {166, 6},
+    {68, 6},   {178, 6},  {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},
+    {169, 6},  {70, 6},   {181, 6},  {76, 6},   {94, 6},   {65, 5},   {193, 6},
+    {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {192, 11}, {152, 7},  {164, 7},
+    {145, 3},  {204, 11}, {155, 7},  {167, 7},  {69, 7},   {179, 7},  {75, 7},
+    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {170, 7},  {71, 7},   {182, 7},
+    {77, 7},   {95, 7},   {65, 5},   {194, 7},  {83, 7},   {101, 7},  {67, 5},
+    {119, 7},  {73, 5},   {91, 5},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
+    {148, 6},  {185, 7},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
+    {103, 7},  {68, 6},   {121, 7},  {74, 6},   {92, 6},   {2, 7},    {209, 12},
+    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {76, 6},   {94, 6},   {4, 7},
+    {193, 6},  {82, 6},   {100, 6},  {8, 7},    {118, 6},  {16, 7},   {32, 7},
+    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {145, 3},  {207, 11}, {156, 8},  {168, 8},  {146, 4},  {180, 8},
+    {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {117, 11}, {72, 8},
+    {135, 11}, {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},
+    {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12},
+    {174, 8},  {148, 6},  {141, 11}, {80, 8},   {98, 8},   {66, 6},   {198, 8},
+    {86, 8},   {104, 8},  {68, 6},   {122, 8},  {74, 6},   {92, 6},   {3, 8},
+    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {76, 6},   {94, 6},
+    {5, 8},    {193, 6},  {82, 6},   {100, 6},  {9, 8},    {118, 6},  {17, 8},
+    {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},
+    {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},
+    {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},
+    {71, 7},   {130, 8},  {77, 7},   {95, 7},   {6, 8},    {194, 7},  {83, 7},
+    {101, 7},  {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12},
+    {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},
+    {197, 7},  {85, 7},   {103, 7},  {12, 8},   {121, 7},  {20, 8},   {36, 8},
+    {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},
+    {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},
+    {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12}, {209, 12},
+    {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12}, {160, 9},
+    {172, 9},  {147, 5},  {184, 9},  {150, 5},  {162, 5},  {65, 5},   {196, 9},
+    {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},   {64, 4},
+    {209, 12}, {209, 12}, {175, 9},  {148, 6},  {143, 11}, {81, 9},   {99, 9},
+    {66, 6},   {199, 9},  {87, 9},   {105, 9},  {68, 6},   {123, 9},  {74, 6},
+    {92, 6},   {64, 4},   {209, 12}, {157, 6},  {111, 9},  {70, 6},   {129, 9},
+    {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},  {67, 5},
+    {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {190, 9},  {152, 7},  {164, 7},  {145, 3},  {202, 9},  {89, 9},
+    {107, 9},  {69, 7},   {125, 9},  {75, 7},   {93, 7},   {64, 4},   {209, 12},
+    {158, 7},  {113, 9},  {71, 7},   {131, 9},  {31, 11},  {47, 11},  {7, 9},
+    {194, 7},  {83, 7},   {55, 11},  {11, 9},   {119, 7},  {19, 9},   {35, 9},
+    {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {137, 9},  {79, 7},
+    {97, 7},   {66, 6},   {197, 7},  {85, 7},   {59, 11},  {13, 9},   {121, 7},
+    {21, 9},   {37, 9},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},
+    {127, 7},  {25, 9},   {41, 9},   {4, 7},    {193, 6},  {82, 6},   {49, 9},
+    {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {205, 9},
+    {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},  {64, 4},
+    {209, 12}, {159, 8},  {115, 9},  {72, 8},   {133, 9},  {78, 8},   {96, 8},
+    {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},  {73, 5},
+    {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},  {139, 9},
+    {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {61, 11},  {14, 9},
+    {122, 8},  {22, 9},   {38, 9},   {3, 8},    {209, 12}, {157, 6},  {110, 8},
+    {70, 6},   {128, 8},  {26, 9},   {42, 9},   {5, 8},    {193, 6},  {82, 6},
+    {50, 9},   {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},    {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},  {145, 3},
+    {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},   {93, 7},
+    {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},  {28, 9},
+    {44, 9},   {6, 8},    {194, 7},  {83, 7},   {52, 9},   {10, 8},   {119, 7},
+    {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},
+    {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {56, 9},
+    {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12}, {157, 6},
+    {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},    {193, 6},
+    {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {145, 3},  {209, 12}, {209, 12}, {209, 12}, {146, 4},  {209, 12}, {149, 4},
+    {161, 4},  {64, 4},   {209, 12}, {209, 12}, {209, 12}, {147, 5},  {209, 12},
+    {150, 5},  {162, 5},  {65, 5},   {209, 12}, {153, 5},  {165, 5},  {67, 5},
+    {177, 5},  {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {176, 10},
+    {148, 6},  {188, 10}, {151, 6},  {163, 6},  {66, 6},   {200, 10}, {154, 6},
+    {166, 6},  {68, 6},   {178, 6},  {74, 6},   {92, 6},   {64, 4},   {209, 12},
+    {157, 6},  {169, 6},  {70, 6},   {181, 6},  {76, 6},   {94, 6},   {65, 5},
+    {193, 6},  {82, 6},   {100, 6},  {67, 5},   {118, 6},  {73, 5},   {91, 5},
+    {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7},
+    {164, 7},  {145, 3},  {203, 10}, {90, 10},  {108, 10}, {69, 7},   {126, 10},
+    {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},  {114, 10}, {71, 7},
+    {132, 10}, {77, 7},   {95, 7},   {65, 5},   {194, 7},  {83, 7},   {101, 7},
+    {67, 5},   {119, 7},  {73, 5},   {91, 5},   {1, 7},    {209, 12}, {209, 12},
+    {173, 7},  {148, 6},  {138, 10}, {79, 7},   {97, 7},   {66, 6},   {197, 7},
+    {85, 7},   {103, 7},  {68, 6},   {121, 7},  {74, 6},   {92, 6},   {2, 7},
+    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},  {76, 6},   {94, 6},
+    {4, 7},    {193, 6},  {82, 6},   {100, 6},  {8, 7},    {118, 6},  {16, 7},
+    {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {145, 3},  {206, 10}, {156, 8},  {168, 8},  {146, 4},
+    {180, 8},  {149, 4},  {161, 4},  {64, 4},   {209, 12}, {159, 8},  {116, 10},
+    {72, 8},   {134, 10}, {78, 8},   {96, 8},   {65, 5},   {195, 8},  {84, 8},
+    {102, 8},  {67, 5},   {120, 8},  {73, 5},   {91, 5},   {64, 4},   {209, 12},
+    {209, 12}, {174, 8},  {148, 6},  {140, 10}, {80, 8},   {98, 8},   {66, 6},
+    {198, 8},  {86, 8},   {62, 11},  {15, 10},  {122, 8},  {23, 10},  {39, 10},
+    {3, 8},    {209, 12}, {157, 6},  {110, 8},  {70, 6},   {128, 8},  {27, 10},
+    {43, 10},  {5, 8},    {193, 6},  {82, 6},   {51, 10},  {9, 8},    {118, 6},
+    {17, 8},   {33, 8},   {0, 6},    {209, 12}, {209, 12}, {209, 12}, {209, 12},
+    {189, 8},  {152, 7},  {164, 7},  {145, 3},  {201, 8},  {88, 8},   {106, 8},
+    {69, 7},   {124, 8},  {75, 7},   {93, 7},   {64, 4},   {209, 12}, {158, 7},
+    {112, 8},  {71, 7},   {130, 8},  {29, 10},  {45, 10},  {6, 8},    {194, 7},
+    {83, 7},   {53, 10},  {10, 8},   {119, 7},  {18, 8},   {34, 8},   {1, 7},
+    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {136, 8},  {79, 7},   {97, 7},
+    {66, 6},   {197, 7},  {85, 7},   {57, 10},  {12, 8},   {121, 7},  {20, 8},
+    {36, 8},   {2, 7},    {209, 12}, {157, 6},  {109, 7},  {70, 6},   {127, 7},
+    {24, 8},   {40, 8},   {4, 7},    {193, 6},  {82, 6},   {48, 8},   {8, 7},
+    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12}, {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},  {209, 12}, {209, 12},
+    {209, 12}, {146, 4},  {209, 12}, {149, 4},  {161, 4},  {64, 4},   {209, 12},
+    {160, 9},  {172, 9},  {147, 5},  {184, 9},  {150, 5},  {162, 5},  {65, 5},
+    {196, 9},  {153, 5},  {165, 5},  {67, 5},   {177, 5},  {73, 5},   {91, 5},
+    {64, 4},   {209, 12}, {209, 12}, {175, 9},  {148, 6},  {142, 10}, {81, 9},
+    {99, 9},   {66, 6},   {199, 9},  {87, 9},   {105, 9},  {68, 6},   {123, 9},
+    {74, 6},   {92, 6},   {64, 4},   {209, 12}, {157, 6},  {111, 9},  {70, 6},
+    {129, 9},  {76, 6},   {94, 6},   {65, 5},   {193, 6},  {82, 6},   {100, 6},
+    {67, 5},   {118, 6},  {73, 5},   {91, 5},   {0, 6},    {209, 12}, {209, 12},
+    {209, 12}, {209, 12}, {190, 9},  {152, 7},  {164, 7},  {145, 3},  {202, 9},
+    {89, 9},   {107, 9},  {69, 7},   {125, 9},  {75, 7},   {93, 7},   {64, 4},
+    {209, 12}, {158, 7},  {113, 9},  {71, 7},   {131, 9},  {30, 10},  {46, 10},
+    {7, 9},    {194, 7},  {83, 7},   {54, 10},  {11, 9},   {119, 7},  {19, 9},
+    {35, 9},   {1, 7},    {209, 12}, {209, 12}, {173, 7},  {148, 6},  {137, 9},
+    {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},   {58, 10},  {13, 9},
+    {121, 7},  {21, 9},   {37, 9},   {2, 7},    {209, 12}, {157, 6},  {109, 7},
+    {70, 6},   {127, 7},  {25, 9},   {41, 9},   {4, 7},    {193, 6},  {82, 6},
+    {49, 9},   {8, 7},    {118, 6},  {16, 7},   {32, 7},   {0, 6},    {209, 12},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3},
+    {205, 9},  {156, 8},  {168, 8},  {146, 4},  {180, 8},  {149, 4},  {161, 4},
+    {64, 4},   {209, 12}, {159, 8},  {115, 9},  {72, 8},   {133, 9},  {78, 8},
+    {96, 8},   {65, 5},   {195, 8},  {84, 8},   {102, 8},  {67, 5},   {120, 8},
+    {73, 5},   {91, 5},   {64, 4},   {209, 12}, {209, 12}, {174, 8},  {148, 6},
+    {139, 9},  {80, 8},   {98, 8},   {66, 6},   {198, 8},  {86, 8},   {60, 10},
+    {14, 9},   {122, 8},  {22, 9},   {38, 9},   {3, 8},    {209, 12}, {157, 6},
+    {110, 8},  {70, 6},   {128, 8},  {26, 9},   {42, 9},   {5, 8},    {193, 6},
+    {82, 6},   {50, 9},   {9, 8},    {118, 6},  {17, 8},   {33, 8},   {0, 6},
+    {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8},  {152, 7},  {164, 7},
+    {145, 3},  {201, 8},  {88, 8},   {106, 8},  {69, 7},   {124, 8},  {75, 7},
+    {93, 7},   {64, 4},   {209, 12}, {158, 7},  {112, 8},  {71, 7},   {130, 8},
+    {28, 9},   {44, 9},   {6, 8},    {194, 7},  {83, 7},   {52, 9},   {10, 8},
+    {119, 7},  {18, 8},   {34, 8},   {1, 7},    {209, 12}, {209, 12}, {173, 7},
+    {148, 6},  {136, 8},  {79, 7},   {97, 7},   {66, 6},   {197, 7},  {85, 7},
+    {56, 9},   {12, 8},   {121, 7},  {20, 8},   {36, 8},   {2, 7},    {209, 12},
+    {157, 6},  {109, 7},  {70, 6},   {127, 7},  {24, 8},   {40, 8},   {4, 7},
+    {193, 6},  {82, 6},   {48, 8},   {8, 7},    {118, 6},  {16, 7},   {32, 7},
+    {0, 6}};
+} // namespace utf8_to_utf16
+} // namespace tables
+} // unnamed namespace
+} // namespace simdutf
+
+#endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H
+/* end file src/tables/utf8_to_utf16_tables.h */
+/* begin file src/tables/utf16_to_utf8_tables.h */
+// file generated by scripts/sse_convert_utf16_to_utf8.py
+#ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H
+#define SIMDUTF_UTF16_TO_UTF8_TABLES_H
+
+namespace simdutf {
+namespace {
+namespace tables {
+namespace utf16_to_utf8 {
+
+// 1 byte for length, 16 bytes for mask
 const uint8_t pack_1_2_utf8_bytes[256][17] = {
     {16, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14},
     {15, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80},
@@ -10734,14577 +13507,24317 @@ const uint8_t pack_1_2_utf8_bytes[256][17] = {
     {10, 1, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
     {9, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80},
-    {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80},
-    {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
-    {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
-    {13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
-    {13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80},
-    {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
-    {13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80},
-    {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
-    {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {13, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 1, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {9, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
-    {13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 1, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {9, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80},
+    {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
+    {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
+    {13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
+    {13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80},
+    {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
+    {13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80},
+    {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
+    {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {13, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 1, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {9, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80},
+    {13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 1, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {9, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
+    {12, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 1, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {9, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {12, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
+    {11, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {11, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 1, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {9, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {11, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 1, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {9, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 1, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {9, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {9, 1, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {8, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80}};
+
+// 1 byte for length, 16 bytes for mask
+const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
+    {12, 2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80},
+    {9, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {11, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {10, 0, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {9, 2, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {8, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {7, 0, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {11, 2, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {8, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {10, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {9, 0, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {10, 2, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {7, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {9, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {8, 0, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {9, 2, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {6, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {8, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {7, 0, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 2, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {3, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {5, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {4, 0, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {8, 2, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 0, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 2, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {4, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {6, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {5, 0, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {11, 2, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {8, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {10, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {9, 0, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {8, 2, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 0, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {10, 2, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {7, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {9, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {8, 0, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {9, 2, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {8, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {7, 0, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {10, 2, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {7, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {9, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {8, 0, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {7, 2, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {6, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {5, 0, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {9, 2, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {8, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {7, 0, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {8, 2, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 0, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {9, 2, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {6, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {8, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {7, 0, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 2, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {3, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {5, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {4, 0, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {8, 2, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 0, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 2, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {4, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {6, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {5, 0, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {6, 2, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {3, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {5, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {4, 0, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {3, 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80, 0x80},
+    {2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80, 0x80},
+    {5, 2, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {2, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {4, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {3, 0, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {4, 2, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80, 0x80},
+    {3, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {2, 0, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {8, 2, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 0, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 2, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {2, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {4, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {3, 0, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {7, 2, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {4, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {6, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 0, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {6, 2, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {3, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {5, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {4, 0, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 2, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {4, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {6, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 0, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {4, 2, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80, 0x80},
+    {3, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {2, 0, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {6, 2, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {3, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {5, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {4, 0, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {5, 2, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {2, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {4, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {3, 0, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {11, 2, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {8, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80},
-    {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80},
-    {12, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 1, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {9, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {12, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80},
-    {11, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {11, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 1, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {9, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+    {10, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {9, 0, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
      0x80},
-    {11, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 1, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {9, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 1, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {9, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {9, 1, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {8, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80}};
+    {8, 2, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 0, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {10, 2, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {7, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {9, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {8, 0, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {9, 2, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {8, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {7, 0, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {8, 2, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 0, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {5, 2, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {2, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80, 0x80},
+    {4, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {3, 0, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {7, 2, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {4, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {6, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {5, 0, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {6, 2, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {3, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {5, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {4, 0, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {10, 2, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {7, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {9, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {8, 0, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {7, 2, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {6, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {5, 0, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {9, 2, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {8, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {7, 0, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {8, 2, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 0, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {9, 2, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {6, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {8, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {7, 0, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 2, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {3, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {5, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {4, 0, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {8, 2, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 0, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 2, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {4, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {6, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {5, 0, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {10, 2, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {7, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {9, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {8, 0, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {7, 2, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {6, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {5, 0, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {9, 2, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {8, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {7, 0, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {8, 2, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 0, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 2, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {4, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {6, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 0, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {4, 2, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80, 0x80},
+    {3, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {2, 0, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {6, 2, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {3, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {5, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {4, 0, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {5, 2, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {2, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {4, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {3, 0, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {9, 2, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
+    {6, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {8, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {7, 0, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 2, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {3, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {5, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {4, 0, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {8, 2, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 0, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 2, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {4, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {6, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {5, 0, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {8, 2, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {7, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {6, 0, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 2, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {2, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {4, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {3, 0, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {7, 2, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {4, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {6, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {5, 0, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {6, 2, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80},
+    {3, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80, 0x80},
+    {5, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80},
+    {4, 0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+     0x80, 0x80}};
+
+} // namespace utf16_to_utf8
+} // namespace tables
+} // unnamed namespace
+} // namespace simdutf
+
+#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H
+/* end file src/tables/utf16_to_utf8_tables.h */
+// End of tables.
+
+// The scalar routines should be included once.
+/* begin file src/scalar/ascii.h */
+#ifndef SIMDUTF_ASCII_H
+#define SIMDUTF_ASCII_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace ascii {
+#if SIMDUTF_IMPLEMENTATION_FALLBACK
+// Only used by the fallback kernel: true iff buf[0..len) is pure ASCII.
+inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  uint64_t pos = 0; // index of the next unchecked byte
+  // process in blocks of 16 bytes (two 64-bit words) when possible
+  for (; pos + 16 <= len; pos += 16) {
+    uint64_t v1;
+    std::memcpy(&v1, data + pos, sizeof(uint64_t)); // bytes pos .. pos+7
+    uint64_t v2;
+    std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); // +8..+15
+    uint64_t v{v1 | v2}; // a high bit set in any of the 16 bytes survives the OR
+    if ((v & 0x8080808080808080) != 0) { // some byte is >= 0x80
+      return false;
+    }
+  }
+  // process the tail (fewer than 16 bytes) byte-by-byte
+  for (; pos < len; pos++) {
+    if (data[pos] >= 0b10000000) { // non-ASCII byte
+      return false;
+    }
+  }
+  return true;
+}
+#endif
+
+inline simdutf_warn_unused result validate_with_errors(const char *buf,
+                                                       size_t len) noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  size_t pos = 0; // index of the next unchecked byte; reported on failure
+  // ASCII check: process in blocks of 16 bytes (two 64-bit words) when possible
+  for (; pos + 16 <= len; pos += 16) {
+    uint64_t v1;
+    std::memcpy(&v1, data + pos, sizeof(uint64_t));
+    uint64_t v2;
+    std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+    uint64_t v{v1 | v2}; // a high bit set in any of the 16 bytes survives the OR
+    if ((v & 0x8080808080808080) != 0) { // a byte >= 0x80 is in this block
+      for (; pos < len; pos++) { // rescan from the block start to pinpoint it
+        if (data[pos] >= 0b10000000) {
+          return result(error_code::TOO_LARGE, pos); // first non-ASCII byte
+        }
+      }
+    }
+  }
+  // process the tail (fewer than 16 bytes) byte-by-byte
+  for (; pos < len; pos++) {
+    if (data[pos] >= 0b10000000) {
+      return result(error_code::TOO_LARGE, pos); // first non-ASCII byte
+    }
+  }
+  return result(error_code::SUCCESS, pos); // every byte was ASCII; pos == len
+}
+
+} // namespace ascii
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/ascii.h */
+/* begin file src/scalar/latin1.h */
+#ifndef SIMDUTF_LATIN1_H
+#define SIMDUTF_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1 {
+
+inline size_t utf32_length_from_latin1(size_t len) {
+  // We are not BOM aware: the count below covers the len input bytes only.
+  return len; // a utf32 unit will always represent 1 latin1 character
+}
+
+inline size_t utf8_length_from_latin1(const char *buf, size_t len) {
+  const uint8_t *c = reinterpret_cast<const uint8_t *>(buf);
+  size_t answer = 0; // number of input bytes with the high bit set
+  for (size_t i = 0; i < len; i++) {
+    if ((c[i] >> 7)) { // byte >= 0x80: counted once more than an ASCII byte
+      answer++;
+    }
+  }
+  return answer + len; // one per input byte, plus one extra per high-bit byte
+}
+
+inline size_t utf16_length_from_latin1(size_t len) { return len; } // returns len unchanged: one unit per input byte
+
+} // namespace latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/latin1.h */
+
+/* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
+#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
+#define SIMDUTF_VALID_UTF32_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_utf8 {
+
+#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
+// only used by the fallback and POWER kernels; performs no input validation
+inline size_t convert_valid(const char32_t *buf, size_t len,
+                            char *utf8_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0; // index of the next UTF-32 unit to convert
+  char *start{utf8_output}; // kept to compute the number of bytes written
+  while (pos < len) {
+    // try to convert the next block of 2 ASCII characters
+    if (pos + 2 <=
+        len) { // if it is safe to read 8 more bytes, check that they are ASCII
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0xFFFFFF80FFFFFF80) == 0) { // both units are below 0x80
+        *utf8_output++ = char(buf[pos]);
+        *utf8_output++ = char(buf[pos + 1]);
+        pos += 2;
+        continue;
+      }
+    }
+    uint32_t word = data[pos];
+    if ((word & 0xFFFFFF80) == 0) {
+      // will generate one UTF-8 byte
+      *utf8_output++ = char(word);
+      pos++;
+    } else if ((word & 0xFFFFF800) == 0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word >> 6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if ((word & 0xFFFF0000) == 0) {
+      // will generate three UTF-8 bytes (surrogate values are not rejected here)
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word >> 12) | 0b11100000);
+      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // will generate four UTF-8 bytes (values above 0x10FFFF are not rejected)
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word >> 18) | 0b11110000);
+      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    }
+  }
+  return utf8_output - start; // number of UTF-8 bytes written
+}
+#endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
+
+} // namespace utf32_to_utf8
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
+/* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
+#ifndef SIMDUTF_UTF32_TO_UTF8_H
+#define SIMDUTF_UTF32_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_utf8 {
+
+inline size_t convert(const char32_t *buf, size_t len, char *utf8_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0; // index of the next UTF-32 unit to convert
+  char *start{utf8_output}; // kept to compute the number of bytes written
+  while (pos < len) {
+    // try to convert the next block of 2 ASCII characters
+    if (pos + 2 <=
+        len) { // if it is safe to read 8 more bytes, check that they are ASCII
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0xFFFFFF80FFFFFF80) == 0) { // both units are below 0x80
+        *utf8_output++ = char(buf[pos]);
+        *utf8_output++ = char(buf[pos + 1]);
+        pos += 2;
+        continue;
+      }
+    }
+    uint32_t word = data[pos];
+    if ((word & 0xFFFFFF80) == 0) {
+      // will generate one UTF-8 byte
+      *utf8_output++ = char(word);
+      pos++;
+    } else if ((word & 0xFFFFF800) == 0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word >> 6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if ((word & 0xFFFF0000) == 0) {
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      if (word >= 0xD800 && word <= 0xDFFF) { // surrogate range is invalid
+        return 0; // failure; bytes already written stay in utf8_output
+      }
+      *utf8_output++ = char((word >> 12) | 0b11100000);
+      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+      if (word > 0x10FFFF) { // beyond the last Unicode code point
+        return 0; // failure; bytes already written stay in utf8_output
+      }
+      *utf8_output++ = char((word >> 18) | 0b11110000);
+      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    }
+  }
+  return utf8_output - start; // bytes written (also 0 when len == 0)
+}
+
+inline result convert_with_errors(const char32_t *buf, size_t len,
+                                  char *utf8_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0; // index of the next UTF-32 unit; reported on failure
+  char *start{utf8_output}; // kept to compute the number of bytes written
+  while (pos < len) {
+    // try to convert the next block of 2 ASCII characters
+    if (pos + 2 <=
+        len) { // if it is safe to read 8 more bytes, check that they are ASCII
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0xFFFFFF80FFFFFF80) == 0) { // both units are below 0x80
+        *utf8_output++ = char(buf[pos]);
+        *utf8_output++ = char(buf[pos + 1]);
+        pos += 2;
+        continue;
+      }
+    }
+    uint32_t word = data[pos];
+    if ((word & 0xFFFFFF80) == 0) {
+      // will generate one UTF-8 byte
+      *utf8_output++ = char(word);
+      pos++;
+    } else if ((word & 0xFFFFF800) == 0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word >> 6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if ((word & 0xFFFF0000) == 0) {
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      if (word >= 0xD800 && word <= 0xDFFF) { // surrogate range is invalid
+        return result(error_code::SURROGATE, pos); // pos = offending unit
+      }
+      *utf8_output++ = char((word >> 12) | 0b11100000);
+      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+      if (word > 0x10FFFF) { // beyond the last Unicode code point
+        return result(error_code::TOO_LARGE, pos); // pos = offending unit
+      }
+      *utf8_output++ = char((word >> 18) | 0b11110000);
+      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    }
+  }
+  return result(error_code::SUCCESS, utf8_output - start); // bytes written
+}
+
+} // namespace utf32_to_utf8
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
+
+/* begin file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
+#ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H
+#define SIMDUTF_VALID_UTF32_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_utf16 {
+
+template <endianness big_endian>
+inline size_t convert_valid(const char32_t *buf, size_t len,
+                            char16_t *utf16_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0; // index of the next UTF-32 unit to convert
+  char16_t *start{utf16_output}; // kept to compute the number of units written
+  while (pos < len) {
+    uint32_t word = data[pos];
+    if ((word & 0xFFFF0000) == 0) {
+      // will not generate a surrogate pair (no validity check is made here)
+      *utf16_output++ = !match_system(big_endian) // NOTE(review): presumably "requested order == host order" - confirm
+                            ? char16_t(utf16::swap_bytes(uint16_t(word)))
+                            : char16_t(word);
+      pos++;
+    } else {
+      // will generate a surrogate pair (values above 0x10FFFF are not rejected)
+      word -= 0x10000; // leaves the 20-bit offset for valid input
+      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); // upper bits
+      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); // low 10 bits
+      if (!match_system(big_endian)) {
+        high_surrogate = utf16::swap_bytes(high_surrogate);
+        low_surrogate = utf16::swap_bytes(low_surrogate);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+      pos++;
+    }
+  }
+  return utf16_output - start; // number of char16_t units written
+}
+
+} // namespace utf32_to_utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
+/* begin file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
+#ifndef SIMDUTF_UTF32_TO_UTF16_H
+#define SIMDUTF_UTF32_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_utf16 {
+
+template <endianness big_endian>
+inline size_t convert(const char32_t *buf, size_t len, char16_t *utf16_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0; // index of the next UTF-32 unit to convert
+  char16_t *start{utf16_output}; // kept to compute the number of units written
+  while (pos < len) {
+    uint32_t word = data[pos];
+    if ((word & 0xFFFF0000) == 0) {
+      if (word >= 0xD800 && word <= 0xDFFF) { // surrogate range is invalid
+        return 0; // failure; units already written stay in utf16_output
+      }
+      // will not generate a surrogate pair
+      *utf16_output++ = !match_system(big_endian) // NOTE(review): presumably "requested order == host order" - confirm
+                            ? char16_t(utf16::swap_bytes(uint16_t(word)))
+                            : char16_t(word);
+    } else {
+      // will generate a surrogate pair
+      if (word > 0x10FFFF) { // beyond the last Unicode code point
+        return 0; // failure; units already written stay in utf16_output
+      }
+      word -= 0x10000; // 20-bit offset, guaranteed by the range check above
+      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); // upper 10 bits
+      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); // low 10 bits
+      if (!match_system(big_endian)) {
+        high_surrogate = utf16::swap_bytes(high_surrogate);
+        low_surrogate = utf16::swap_bytes(low_surrogate);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+    }
+    pos++;
+  }
+  return utf16_output - start; // units written (also 0 when len == 0)
+}
+
+template <endianness big_endian>
+inline result convert_with_errors(const char32_t *buf, size_t len,
+                                  char16_t *utf16_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  size_t pos = 0; // index of the next UTF-32 value; reported on error
+  char16_t *start{utf16_output}; // saved to compute how many units were written
+  while (pos < len) {
+    uint32_t word = data[pos];
+    if ((word & 0xFFFF0000) == 0) { // upper 16 bits clear: fits one 16-bit unit
+      if (word >= 0xD800 && word <= 0xDFFF) {
+        return result(error_code::SURROGATE, pos); // pos = offending input index
+      }
+      // will not generate a surrogate pair (swapped if !match_system(big_endian))
+      *utf16_output++ = !match_system(big_endian)
+                            ? char16_t(utf16::swap_bytes(uint16_t(word)))
+                            : char16_t(word);
+    } else {
+      // will generate a surrogate pair
+      if (word > 0x10FFFF) {
+        return result(error_code::TOO_LARGE, pos); // pos = offending input index
+      }
+      word -= 0x10000; // the pair carries the (at most 20-bit) offset from 0x10000
+      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+      if (!match_system(big_endian)) {
+        high_surrogate = utf16::swap_bytes(high_surrogate);
+        low_surrogate = utf16::swap_bytes(low_surrogate);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+    }
+    pos++;
+  }
+  return result(error_code::SUCCESS, utf16_output - start); // units written
+}
+
+} // namespace utf32_to_utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
+
+/* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
+#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H
+#define SIMDUTF_VALID_UTF16_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_utf8 {
+
+template <endianness big_endian>
+inline size_t convert_valid(const char16_t *buf, size_t len,
+                            char *utf8_output) {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0; // index of the next UTF-16 code unit to read
+  char *start{utf8_output}; // saved to compute how many bytes were written
+  while (pos < len) {
+    // try to convert the next block of 4 ASCII characters
+    if (pos + 4 <=
+        len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if (!match_system(big_endian)) {
+        v = (v >> 8) | (v << (64 - 8)); // rotate right by 8 bits before masking
+      }
+      if ((v & 0xFF80FF80FF80FF80) == 0) {
+        size_t final_pos = pos + 4;
+        while (pos < final_pos) {
+          *utf8_output++ = !match_system(big_endian)
+                               ? char(utf16::swap_bytes(buf[pos]))
+                               : char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+
+    uint16_t word =
+        !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+    if ((word & 0xFF80) == 0) {
+      // will generate one UTF-8 byte
+      *utf8_output++ = char(word);
+      pos++;
+    } else if ((word & 0xF800) == 0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word >> 6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if ((word & 0xF800) != 0xD800) {
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word >> 12) | 0b11100000);
+      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // must be a surrogate pair (the surrogate ranges are not validated here)
+      uint16_t diff = uint16_t(word - 0xD800);
+      if (pos + 1 >= len) {
+        return 0;
+      } // minimal bound checking: no second unit left, report 0
+      uint16_t next_word = !match_system(big_endian)
+                               ? utf16::swap_bytes(data[pos + 1])
+                               : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      uint32_t value = (diff << 10) + diff2 + 0x10000;
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((value >> 18) | 0b11110000);
+      *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((value & 0b111111) | 0b10000000);
+      pos += 2;
+    }
+  }
+  return utf8_output - start; // number of UTF-8 bytes written
+}
+
+} // namespace utf16_to_utf8
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
+/* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
+#ifndef SIMDUTF_UTF16_TO_UTF8_H
+#define SIMDUTF_UTF16_TO_UTF8_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_utf8 {
+
+template <endianness big_endian>
+inline size_t convert(const char16_t *buf, size_t len, char *utf8_output) {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0; // index of the next UTF-16 code unit to read
+  char *start{utf8_output}; // saved to compute how many bytes were written
+  while (pos < len) {
+    // try to convert the next block of 4 ASCII code units (8 bytes)
+    if (pos + 4 <=
+        len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if (!match_system(big_endian)) {
+        v = (v >> 8) | (v << (64 - 8)); // rotate right by 8 bits before masking
+      }
+      if ((v & 0xFF80FF80FF80FF80) == 0) {
+        size_t final_pos = pos + 4;
+        while (pos < final_pos) {
+          *utf8_output++ = !match_system(big_endian)
+                               ? char(utf16::swap_bytes(buf[pos]))
+                               : char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    uint16_t word =
+        !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+    if ((word & 0xFF80) == 0) {
+      // will generate one UTF-8 byte
+      *utf8_output++ = char(word);
+      pos++;
+    } else if ((word & 0xF800) == 0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word >> 6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if ((word & 0xF800) != 0xD800) {
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word >> 12) | 0b11100000);
+      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // must be a surrogate pair
+      if (pos + 1 >= len) {
+        return 0; // no second unit left for the pair: report 0
+      }
+      uint16_t diff = uint16_t(word - 0xD800);
+      if (diff > 0x3FF) {
+        return 0; // first unit is in 0xDC00..0xDFFF, not 0xD800..0xDBFF
+      }
+      uint16_t next_word = !match_system(big_endian)
+                               ? utf16::swap_bytes(data[pos + 1])
+                               : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if (diff2 > 0x3FF) {
+        return 0; // second unit is outside 0xDC00..0xDFFF
+      }
+      uint32_t value = (diff << 10) + diff2 + 0x10000;
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((value >> 18) | 0b11110000);
+      *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((value & 0b111111) | 0b10000000);
+      pos += 2;
+    }
+  }
+  return utf8_output - start; // number of UTF-8 bytes written
+}
+
+template <endianness big_endian>
+inline result convert_with_errors(const char16_t *buf, size_t len,
+                                  char *utf8_output) {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0; // index of the next UTF-16 code unit; reported on error
+  char *start{utf8_output}; // saved to compute how many bytes were written
+  while (pos < len) {
+    // try to convert the next block of 4 ASCII code units (8 bytes)
+    if (pos + 4 <=
+        len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if (!match_system(big_endian))
+        v = (v >> 8) | (v << (64 - 8)); // rotate right by 8 bits before masking
+      if ((v & 0xFF80FF80FF80FF80) == 0) {
+        size_t final_pos = pos + 4;
+        while (pos < final_pos) {
+          *utf8_output++ = !match_system(big_endian)
+                               ? char(utf16::swap_bytes(buf[pos]))
+                               : char(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    uint16_t word =
+        !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+    if ((word & 0xFF80) == 0) {
+      // will generate one UTF-8 byte
+      *utf8_output++ = char(word);
+      pos++;
+    } else if ((word & 0xF800) == 0) {
+      // will generate two UTF-8 bytes
+      // we have 0b110XXXXX 0b10XXXXXX
+      *utf8_output++ = char((word >> 6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else if ((word & 0xF800) != 0xD800) {
+      // will generate three UTF-8 bytes
+      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((word >> 12) | 0b11100000);
+      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+      pos++;
+    } else {
+      // must be a surrogate pair; every failure below reports index pos
+      if (pos + 1 >= len) {
+        return result(error_code::SURROGATE, pos); // no second unit left
+      }
+      uint16_t diff = uint16_t(word - 0xD800);
+      if (diff > 0x3FF) {
+        return result(error_code::SURROGATE, pos); // first unit is 0xDC00..0xDFFF
+      }
+      uint16_t next_word = !match_system(big_endian)
+                               ? utf16::swap_bytes(data[pos + 1])
+                               : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if (diff2 > 0x3FF) {
+        return result(error_code::SURROGATE, pos); // second unit not 0xDC00..0xDFFF
+      }
+      uint32_t value = (diff << 10) + diff2 + 0x10000;
+      // will generate four UTF-8 bytes
+      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+      *utf8_output++ = char((value >> 18) | 0b11110000);
+      *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((value & 0b111111) | 0b10000000);
+      pos += 2;
+    }
+  }
+  return result(error_code::SUCCESS, utf8_output - start); // bytes written
+}
+
+} // namespace utf16_to_utf8
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
+
+/* begin file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
+#ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H
+#define SIMDUTF_VALID_UTF16_TO_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_utf32 {
+
+template <endianness big_endian>
+inline size_t convert_valid(const char16_t *buf, size_t len,
+                            char32_t *utf32_output) {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0; // index of the next UTF-16 code unit to read
+  char32_t *start{utf32_output}; // saved to compute how many values were written
+  while (pos < len) {
+    uint16_t word =
+        !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+    if ((word & 0xF800) != 0xD800) {
+      // No surrogate pair, extend 16-bit word to 32-bit word
+      *utf32_output++ = char32_t(word);
+      pos++;
+    } else {
+      // must be a surrogate pair (the surrogate ranges are not validated here)
+      uint16_t diff = uint16_t(word - 0xD800);
+      if (pos + 1 >= len) {
+        return 0;
+      } // minimal bound checking: no second unit left, report 0
+      uint16_t next_word = !match_system(big_endian)
+                               ? utf16::swap_bytes(data[pos + 1])
+                               : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      uint32_t value = (diff << 10) + diff2 + 0x10000;
+      *utf32_output++ = char32_t(value);
+      pos += 2;
+    }
+  }
+  return utf32_output - start; // number of char32_t values written
+}
+
+} // namespace utf16_to_utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
+/* begin file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
+#ifndef SIMDUTF_UTF16_TO_UTF32_H
+#define SIMDUTF_UTF16_TO_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_utf32 {
+
+template <endianness big_endian>
+inline size_t convert(const char16_t *buf, size_t len, char32_t *utf32_output) {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0; // index of the next UTF-16 code unit to read
+  char32_t *start{utf32_output}; // saved to compute how many values were written
+  while (pos < len) {
+    uint16_t word =
+        !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+    if ((word & 0xF800) != 0xD800) {
+      // No surrogate pair, extend 16-bit word to 32-bit word
+      *utf32_output++ = char32_t(word);
+      pos++;
+    } else {
+      // must be a surrogate pair
+      uint16_t diff = uint16_t(word - 0xD800);
+      if (diff > 0x3FF) {
+        return 0; // first unit is in 0xDC00..0xDFFF, not 0xD800..0xDBFF
+      }
+      if (pos + 1 >= len) {
+        return 0;
+      } // minimal bound checking: no second unit left, report 0
+      uint16_t next_word = !match_system(big_endian)
+                               ? utf16::swap_bytes(data[pos + 1])
+                               : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if (diff2 > 0x3FF) {
+        return 0; // second unit is outside 0xDC00..0xDFFF
+      }
+      uint32_t value = (diff << 10) + diff2 + 0x10000;
+      *utf32_output++ = char32_t(value);
+      pos += 2;
+    }
+  }
+  return utf32_output - start; // number of char32_t values written
+}
+
+template <endianness big_endian>
+inline result convert_with_errors(const char16_t *buf, size_t len,
+                                  char32_t *utf32_output) {
+  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  size_t pos = 0; // index of the next UTF-16 code unit; reported on error
+  char32_t *start{utf32_output}; // saved to compute how many values were written
+  while (pos < len) {
+    uint16_t word =
+        !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
+    if ((word & 0xF800) != 0xD800) {
+      // No surrogate pair, extend 16-bit word to 32-bit word
+      *utf32_output++ = char32_t(word);
+      pos++;
+    } else {
+      // must be a surrogate pair; every failure below reports index pos
+      uint16_t diff = uint16_t(word - 0xD800);
+      if (diff > 0x3FF) {
+        return result(error_code::SURROGATE, pos); // first unit is 0xDC00..0xDFFF
+      }
+      if (pos + 1 >= len) {
+        return result(error_code::SURROGATE, pos);
+      } // minimal bound checking: no second unit left
+      uint16_t next_word = !match_system(big_endian)
+                               ? utf16::swap_bytes(data[pos + 1])
+                               : data[pos + 1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if (diff2 > 0x3FF) {
+        return result(error_code::SURROGATE, pos); // second unit not 0xDC00..0xDFFF
+      }
+      uint32_t value = (diff << 10) + diff2 + 0x10000;
+      *utf32_output++ = char32_t(value);
+      pos += 2;
+    }
+  }
+  return result(error_code::SUCCESS, utf32_output - start); // values written
+}
+
+} // namespace utf16_to_utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
+
+/* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
+#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
+#define SIMDUTF_VALID_UTF8_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_utf16 {
+
+template <endianness big_endian>
+inline size_t convert_valid(const char *buf, size_t len,
+                            char16_t *utf16_output) {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  size_t pos = 0; // index of the next UTF-8 byte to read
+  char16_t *start{utf16_output}; // saved to compute how many units were written
+  while (pos < len) {
+    // try to convert the next block of 8 ASCII bytes
+    if (pos + 8 <=
+        len) { // if it is safe to read 8 more bytes, check that they are ascii
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0x8080808080808080) == 0) { // no byte has its high bit set
+        size_t final_pos = pos + 8;
+        while (pos < final_pos) {
+          *utf16_output++ = !match_system(big_endian)
+                                ? char16_t(utf16::swap_bytes(buf[pos]))
+                                : char16_t(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    uint8_t leading_byte = data[pos]; // leading byte
+    if (leading_byte < 0b10000000) {
+      // converting one ASCII byte !!!
+      *utf16_output++ = !match_system(big_endian)
+                            ? char16_t(utf16::swap_bytes(leading_byte))
+                            : char16_t(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) {
+      // We have a two-byte UTF-8, it should become
+      // a single UTF-16 word.
+      if (pos + 1 >= len) {
+        break;
+      } // minimal bound checking: truncated sequence ends the loop
+      uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) |
+                                     (data[pos + 1] & 0b00111111));
+      if (!match_system(big_endian)) {
+        code_point = utf16::swap_bytes(uint16_t(code_point));
+      }
+      *utf16_output++ = char16_t(code_point);
+      pos += 2;
+    } else if ((leading_byte & 0b11110000) == 0b11100000) {
+      // We have a three-byte UTF-8, it should become
+      // a single UTF-16 word.
+      if (pos + 2 >= len) {
+        break;
+      } // minimal bound checking: truncated sequence ends the loop
+      uint16_t code_point = uint16_t(((leading_byte & 0b00001111) << 12) |
+                                     ((data[pos + 1] & 0b00111111) << 6) |
+                                     (data[pos + 2] & 0b00111111));
+      if (!match_system(big_endian)) {
+        code_point = utf16::swap_bytes(uint16_t(code_point));
+      }
+      *utf16_output++ = char16_t(code_point);
+      pos += 3;
+    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // we have a 4-byte UTF-8 word; it becomes a UTF-16 surrogate pair.
+      if (pos + 3 >= len) {
+        break;
+      } // minimal bound checking: truncated sequence ends the loop
+      uint32_t code_point = ((leading_byte & 0b00000111) << 18) |
+                            ((data[pos + 1] & 0b00111111) << 12) |
+                            ((data[pos + 2] & 0b00111111) << 6) |
+                            (data[pos + 3] & 0b00111111);
+      code_point -= 0x10000;
+      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
+      if (!match_system(big_endian)) {
+        high_surrogate = utf16::swap_bytes(high_surrogate);
+        low_surrogate = utf16::swap_bytes(low_surrogate);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+      pos += 4;
+    } else {
+      // we may have a continuation but we do not do error checking
+      return 0;
+    }
+  }
+  return utf16_output - start; // number of char16_t units written
+}
+
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
+/* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
+#ifndef SIMDUTF_UTF8_TO_UTF16_H
+#define SIMDUTF_UTF8_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_utf16 {
+
+template <endianness big_endian>
+inline size_t convert(const char *buf, size_t len, char16_t *utf16_output) {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  size_t pos = 0; // index of the next UTF-8 byte to read
+  char16_t *start{utf16_output}; // saved to compute how many units were written
+  while (pos < len) {
+    // try to convert the next block of 16 ASCII bytes
+    if (pos + 16 <=
+        len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2}; // OR both halves so one mask test covers all 16 bytes
+      if ((v & 0x8080808080808080) == 0) {
+        size_t final_pos = pos + 16;
+        while (pos < final_pos) {
+          *utf16_output++ = !match_system(big_endian)
+                                ? char16_t(utf16::swap_bytes(buf[pos]))
+                                : char16_t(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+
+    uint8_t leading_byte = data[pos]; // leading byte
+    if (leading_byte < 0b10000000) {
+      // converting one ASCII byte !!!
+      *utf16_output++ = !match_system(big_endian)
+                            ? char16_t(utf16::swap_bytes(leading_byte))
+                            : char16_t(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) {
+      // We have a two-byte UTF-8, it should become
+      // a single UTF-16 word.
+      if (pos + 1 >= len) {
+        return 0;
+      } // minimal bound checking: truncated sequence, report 0
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return 0; // second byte is not of the form 0b10XXXXXX
+      }
+      // range check: a two-byte sequence must encode 0x80..0x7ff
+      uint32_t code_point =
+          (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if (code_point < 0x80 || 0x7ff < code_point) {
+        return 0;
+      }
+      if (!match_system(big_endian)) {
+        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+      }
+      *utf16_output++ = char16_t(code_point);
+      pos += 2;
+    } else if ((leading_byte & 0b11110000) == 0b11100000) {
+      // We have a three-byte UTF-8, it should become
+      // a single UTF-16 word.
+      if (pos + 2 >= len) {
+        return 0;
+      } // minimal bound checking: truncated sequence, report 0
+
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return 0; // second byte is not of the form 0b10XXXXXX
+      }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+        return 0; // third byte is not of the form 0b10XXXXXX
+      }
+      // range check: 0x800..0xffff, excluding the surrogates 0xd800..0xdfff
+      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
+                            (data[pos + 1] & 0b00111111) << 6 |
+                            (data[pos + 2] & 0b00111111);
+      if (code_point < 0x800 || 0xffff < code_point ||
+          (0xd7ff < code_point && code_point < 0xe000)) {
+        return 0;
+      }
+      if (!match_system(big_endian)) {
+        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+      }
+      *utf16_output++ = char16_t(code_point);
+      pos += 3;
+    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // we have a 4-byte UTF-8 word; it becomes a UTF-16 surrogate pair.
+      if (pos + 3 >= len) {
+        return 0;
+      } // minimal bound checking: truncated sequence, report 0
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return 0; // second byte is not of the form 0b10XXXXXX
+      }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+        return 0; // third byte is not of the form 0b10XXXXXX
+      }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+        return 0; // fourth byte is not of the form 0b10XXXXXX
+      }
+
+      // range check: a four-byte sequence must encode 0x10000..0x10ffff
+      uint32_t code_point = (leading_byte & 0b00000111) << 18 |
+                            (data[pos + 1] & 0b00111111) << 12 |
+                            (data[pos + 2] & 0b00111111) << 6 |
+                            (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff || 0x10ffff < code_point) {
+        return 0;
+      }
+      code_point -= 0x10000;
+      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
+      if (!match_system(big_endian)) {
+        high_surrogate = utf16::swap_bytes(high_surrogate);
+        low_surrogate = utf16::swap_bytes(low_surrogate);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+      pos += 4;
+    } else {
+      return 0; // byte is 0b10XXXXXX or 0b11111XXX: not a valid leading byte
+    }
+  }
+  return utf16_output - start; // number of char16_t units written
+}
+
+template <endianness big_endian>
+inline result convert_with_errors(const char *buf, size_t len,
+                                  char16_t *utf16_output) {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  size_t pos = 0; // index of the next UTF-8 byte; reported on error
+  char16_t *start{utf16_output}; // saved to compute how many units were written
+  while (pos < len) {
+    // try to convert the next block of 16 ASCII bytes
+    if (pos + 16 <=
+        len) { // if it is safe to read 16 more bytes, check that they are ascii
+      uint64_t v1;
+      ::memcpy(&v1, data + pos, sizeof(uint64_t));
+      uint64_t v2;
+      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
+      uint64_t v{v1 | v2}; // OR both halves so one mask test covers all 16 bytes
+      if ((v & 0x8080808080808080) == 0) {
+        size_t final_pos = pos + 16;
+        while (pos < final_pos) {
+          *utf16_output++ = !match_system(big_endian)
+                                ? char16_t(utf16::swap_bytes(buf[pos]))
+                                : char16_t(buf[pos]);
+          pos++;
+        }
+        continue;
+      }
+    }
+    uint8_t leading_byte = data[pos]; // leading byte
+    if (leading_byte < 0b10000000) {
+      // converting one ASCII byte !!!
+      *utf16_output++ = !match_system(big_endian)
+                            ? char16_t(utf16::swap_bytes(leading_byte))
+                            : char16_t(leading_byte);
+      pos++;
+    } else if ((leading_byte & 0b11100000) == 0b11000000) {
+      // We have a two-byte UTF-8, it should become
+      // a single UTF-16 word.
+      if (pos + 1 >= len) {
+        return result(error_code::TOO_SHORT, pos);
+      } // minimal bound checking: input ends inside the sequence
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos); // second byte not 0b10XXXXXX
+      }
+      // range check: a two-byte sequence must encode 0x80..0x7ff
+      uint32_t code_point =
+          (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
+      if (code_point < 0x80 || 0x7ff < code_point) {
+        return result(error_code::OVERLONG, pos);
+      }
+      if (!match_system(big_endian)) {
+        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+      }
+      *utf16_output++ = char16_t(code_point);
+      pos += 2;
+    } else if ((leading_byte & 0b11110000) == 0b11100000) {
+      // We have a three-byte UTF-8, it should become
+      // a single UTF-16 word.
+      if (pos + 2 >= len) {
+        return result(error_code::TOO_SHORT, pos);
+      } // minimal bound checking: input ends inside the sequence
+
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos); // second byte not 0b10XXXXXX
+      }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos); // third byte not 0b10XXXXXX
+      }
+      // range check: 0x800..0xffff, with surrogates reported separately below
+      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
+                            (data[pos + 1] & 0b00111111) << 6 |
+                            (data[pos + 2] & 0b00111111);
+      if ((code_point < 0x800) || (0xffff < code_point)) {
+        return result(error_code::OVERLONG, pos);
+      }
+      if (0xd7ff < code_point && code_point < 0xe000) {
+        return result(error_code::SURROGATE, pos);
+      }
+      if (!match_system(big_endian)) {
+        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
+      }
+      *utf16_output++ = char16_t(code_point);
+      pos += 3;
+    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
+      // we have a 4-byte UTF-8 word; it becomes a UTF-16 surrogate pair.
+      if (pos + 3 >= len) {
+        return result(error_code::TOO_SHORT, pos);
+      } // minimal bound checking: input ends inside the sequence
+      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos); // second byte not 0b10XXXXXX
+      }
+      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos); // third byte not 0b10XXXXXX
+      }
+      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
+        return result(error_code::TOO_SHORT, pos); // fourth byte not 0b10XXXXXX
+      }
+
+      // range check: a four-byte sequence must encode 0x10000..0x10ffff
+      uint32_t code_point = (leading_byte & 0b00000111) << 18 |
+                            (data[pos + 1] & 0b00111111) << 12 |
+                            (data[pos + 2] & 0b00111111) << 6 |
+                            (data[pos + 3] & 0b00111111);
+      if (code_point <= 0xffff) {
+        return result(error_code::OVERLONG, pos);
+      }
+      if (0x10ffff < code_point) {
+        return result(error_code::TOO_LARGE, pos);
+      }
+      code_point -= 0x10000;
+      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
+      if (!match_system(big_endian)) {
+        high_surrogate = utf16::swap_bytes(high_surrogate);
+        low_surrogate = utf16::swap_bytes(low_surrogate);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+      pos += 4;
+    } else {
+      // we either have too many continuation bytes or an invalid leading byte
+      if ((leading_byte & 0b11000000) == 0b10000000) {
+        return result(error_code::TOO_LONG, pos);
+      } else {
+        return result(error_code::HEADER_BITS, pos);
+      }
+    }
+  }
+  return result(error_code::SUCCESS, utf16_output - start); // units written
+}
+
+/**
+ * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and
+ * we have up to len input bytes left, and we encountered some error. It is
+ * possible that the error is at 'buf' exactly, but it could also be in the
+ * previous bytes (up to 3 bytes back).
+ *
+ * prior_bytes indicates how many bytes, prior to 'buf' may belong to the
+ * current memory section and can be safely accessed. We use prior_bytes to
+ * bound how far before 'buf' the search for a leading byte may read.
+ *
+ * The caller is responsible to ensure that len > 0.
+ *
+ * If the error is believed to have occurred prior to 'buf', the count value
+ * contained in the result will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
+ */
+template <endianness endian>
+inline result rewind_and_convert_with_errors(size_t prior_bytes,
+                                             const char *buf, size_t len,
+                                             char16_t *utf16_output) {
+  size_t extra_len{0}; // number of bytes buf was moved back by
+  // We potentially need to go back in time and find a leading byte.
+  // In theory '3' would be sufficient, but sometimes the error can go back
+  // quite far.
+  size_t how_far_back = prior_bytes;
+  // size_t how_far_back = 3; // 3 bytes in the past + current position
+  // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
+  bool found_leading_bytes{false};
+  // important: it is i <= how_far_back and not 'i < how_far_back'.
+  for (size_t i = 0; i <= how_far_back; i++) {
+    unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
+    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
+    if (found_leading_bytes) {
+      if (i > 0 && byte < 128) {
+        // If we had to go back and the leading byte is ascii
+        // then we can stop right away.
+        return result(error_code::TOO_LONG, 0 - i + 1); // i.e. 1 - i, may wrap
+      }
+      buf -= i;
+      extra_len = i;
+      break;
+    }
+  }
+  //
+  // It is possible for this function to return a negative count in its result.
+  // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described
+  // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
+  // unsigned integral type of the result of the sizeof operator
+  //
+  // An unsigned type will simply wrap round arithmetically (well defined).
+  //
+  if (!found_leading_bytes) {
+    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
+    // [....] [continuation] [continuation] [continuation] | [buf is continuation]
+    // Or we possibly have a stream that does not start with a
+    // leading byte.
+    return result(error_code::TOO_LONG, 0 - how_far_back);
+  }
+  result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
+  if (res.error) {
+    res.count -= extra_len; // make the error index relative to the original buf
+  }
+  return res;
+}
+
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
+
+/* begin file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
+#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
+#define SIMDUTF_VALID_UTF8_TO_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_utf32 {
+
+inline size_t convert_valid(const char *buf, size_t len,
+                            char32_t *utf32_output) {
+  // Transcodes UTF-8 that the caller vouches for into UTF-32 and returns the
+  // number of char32_t values written. Continuation bytes are never checked:
+  // a sequence cut off by the end of the input ends the conversion early,
+  // and a byte that cannot open a sequence makes the function return 0.
+  const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(buf);
+  char32_t *out = utf32_output;
+  size_t i = 0;
+  while (i < len) {
+    // Fast path: eight bytes with no high bit set are all ASCII.
+    if (i + 8 <= len) {
+      uint64_t chunk;
+      ::memcpy(&chunk, bytes + i, sizeof(chunk));
+      if ((chunk & 0x8080808080808080) == 0) {
+        for (const size_t stop = i + 8; i < stop; ++i) {
+          *out++ = char32_t(buf[i]);
+        }
+        continue;
+      }
+    }
+    const uint8_t lead = bytes[i];
+    if (lead < 0x80) { // 0xxxxxxx: a lone ASCII byte
+      *out++ = char32_t(lead);
+      i += 1;
+      continue;
+    }
+    if ((lead & 0xE0) == 0xC0) { // 110xxxxx: two-byte sequence
+      if (i + 1 >= len) {
+        break;
+      }
+      *out++ = char32_t(((lead & 0x1F) << 6) | (bytes[i + 1] & 0x3F));
+      i += 2;
+      continue;
+    }
+    if ((lead & 0xF0) == 0xE0) { // 1110xxxx: three-byte sequence
+      if (i + 2 >= len) {
+        break;
+      }
+      *out++ = char32_t(((lead & 0x0F) << 12) | ((bytes[i + 1] & 0x3F) << 6) |
+                        (bytes[i + 2] & 0x3F));
+      i += 3;
+      continue;
+    }
+    if ((lead & 0xF8) != 0xF0) {
+      // Stray continuation byte or invalid lead: nothing is validated here.
+      return 0;
+    }
+    // 11110xxx: four-byte sequence.
+    if (i + 3 >= len) {
+      break;
+    }
+    const uint32_t code_point =
+        ((lead & 0x07) << 18) | ((bytes[i + 1] & 0x3F) << 12) |
+        ((bytes[i + 2] & 0x3F) << 6) | (bytes[i + 3] & 0x3F);
+    *out++ = char32_t(code_point);
+    i += 4;
+  }
+  return out - utf32_output;
+}
+
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
+/* begin file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
+#ifndef SIMDUTF_UTF8_TO_UTF32_H
+#define SIMDUTF_UTF8_TO_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_utf32 {
+
+inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) {
+  // Validating UTF-8 to UTF-32 transcoder.
+  //
+  // Reads `len` bytes starting at `buf` and writes one char32_t per decoded
+  // code point to `utf32_output`. Returns the number of values written, or 0
+  // as soon as the input turns out to be invalid: a sequence that runs past
+  // the end of the input, a missing continuation byte, an overlong encoding,
+  // a surrogate (U+D800..U+DFFF), a value above U+10FFFF, or a byte that
+  // cannot start a sequence. Output written before the failure is not undone,
+  // and a return value of 0 is also what an empty input produces.
+  const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(buf);
+  // True when `b` has the 10xxxxxx shape of a continuation byte.
+  const auto is_continuation = [](uint8_t b) { return (b & 0xC0) == 0x80; };
+  char32_t *out = utf32_output;
+  size_t i = 0;
+  while (i < len) {
+    // Fast path: sixteen bytes with no high bit set are all ASCII.
+    if (i + 16 <= len) {
+      uint64_t halves[2];
+      ::memcpy(halves, bytes + i, sizeof(halves));
+      if (((halves[0] | halves[1]) & 0x8080808080808080) == 0) {
+        for (const size_t stop = i + 16; i < stop; ++i) {
+          *out++ = char32_t(buf[i]);
+        }
+        continue;
+      }
+    }
+    const uint8_t lead = bytes[i];
+    if (lead < 0x80) {
+      // 0xxxxxxx: a lone ASCII byte.
+      *out++ = char32_t(lead);
+      i += 1;
+      continue;
+    }
+    if ((lead & 0xE0) == 0xC0) {
+      // 110xxxxx 10xxxxxx
+      if (i + 1 >= len) {
+        return 0; // the sequence runs past the end of the input
+      }
+      if (!is_continuation(bytes[i + 1])) {
+        return 0;
+      }
+      const uint32_t code_point = (lead & 0x1F) << 6 | (bytes[i + 1] & 0x3F);
+      // Two bytes must encode U+0080..U+07FF.
+      if (code_point < 0x80 || code_point > 0x7ff) {
+        return 0;
+      }
+      *out++ = char32_t(code_point);
+      i += 2;
+      continue;
+    }
+    if ((lead & 0xF0) == 0xE0) {
+      // 1110xxxx 10xxxxxx 10xxxxxx
+      if (i + 2 >= len) {
+        return 0; // the sequence runs past the end of the input
+      }
+      if (!is_continuation(bytes[i + 1]) || !is_continuation(bytes[i + 2])) {
+        return 0;
+      }
+      const uint32_t code_point = (lead & 0x0F) << 12 |
+                                  (bytes[i + 1] & 0x3F) << 6 |
+                                  (bytes[i + 2] & 0x3F);
+      // Three bytes must encode U+0800..U+FFFF, surrogates excluded.
+      const bool out_of_range = code_point < 0x800 || code_point > 0xffff;
+      const bool surrogate = code_point > 0xd7ff && code_point < 0xe000;
+      if (out_of_range || surrogate) {
+        return 0;
+      }
+      *out++ = char32_t(code_point);
+      i += 3;
+      continue;
+    }
+    if ((lead & 0xF8) != 0xF0) {
+      // Stray continuation byte or invalid lead byte.
+      return 0;
+    }
+    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    if (i + 3 >= len) {
+      return 0; // the sequence runs past the end of the input
+    }
+    if (!is_continuation(bytes[i + 1]) || !is_continuation(bytes[i + 2]) ||
+        !is_continuation(bytes[i + 3])) {
+      return 0;
+    }
+    const uint32_t code_point =
+        (lead & 0x07) << 18 | (bytes[i + 1] & 0x3F) << 12 |
+        (bytes[i + 2] & 0x3F) << 6 | (bytes[i + 3] & 0x3F);
+    // Four bytes must encode U+10000..U+10FFFF.
+    if (code_point <= 0xffff || code_point > 0x10ffff) {
+      return 0;
+    }
+    *out++ = char32_t(code_point);
+    i += 4;
+  }
+  return out - utf32_output;
+}
+
+inline result convert_with_errors(const char *buf, size_t len,
+                                  char32_t *utf32_output) {
+  // Validating UTF-8 to UTF-32 transcoder that reports where it failed.
+  //
+  // On success the result carries error_code::SUCCESS and the number of
+  // char32_t values written to `utf32_output`. On failure it carries the
+  // error code and the byte offset in `buf` of the sequence that was being
+  // decoded:
+  //   TOO_SHORT   - the sequence runs past the end of the input or one of its
+  //                 continuation bytes is missing
+  //   OVERLONG    - the value would fit in a shorter sequence
+  //   SURROGATE   - a three-byte sequence encodes U+D800..U+DFFF
+  //   TOO_LARGE   - a four-byte sequence encodes a value above U+10FFFF
+  //   TOO_LONG    - a continuation byte appears where a lead byte is expected
+  //   HEADER_BITS - the byte is neither a valid lead nor a continuation byte
+  const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(buf);
+  // True when `b` has the 10xxxxxx shape of a continuation byte.
+  const auto is_continuation = [](uint8_t b) { return (b & 0xC0) == 0x80; };
+  char32_t *out = utf32_output;
+  size_t i = 0;
+  while (i < len) {
+    // Fast path: sixteen bytes with no high bit set are all ASCII.
+    if (i + 16 <= len) {
+      uint64_t halves[2];
+      ::memcpy(halves, bytes + i, sizeof(halves));
+      if (((halves[0] | halves[1]) & 0x8080808080808080) == 0) {
+        for (const size_t stop = i + 16; i < stop; ++i) {
+          *out++ = char32_t(buf[i]);
+        }
+        continue;
+      }
+    }
+    const uint8_t lead = bytes[i];
+    if (lead < 0x80) {
+      // 0xxxxxxx: a lone ASCII byte.
+      *out++ = char32_t(lead);
+      i += 1;
+      continue;
+    }
+    if ((lead & 0xE0) == 0xC0) {
+      // 110xxxxx 10xxxxxx
+      if (i + 1 >= len) {
+        return result(error_code::TOO_SHORT, i);
+      }
+      if (!is_continuation(bytes[i + 1])) {
+        return result(error_code::TOO_SHORT, i);
+      }
+      const uint32_t code_point = (lead & 0x1F) << 6 | (bytes[i + 1] & 0x3F);
+      // Two bytes must encode U+0080..U+07FF.
+      if (code_point < 0x80 || code_point > 0x7ff) {
+        return result(error_code::OVERLONG, i);
+      }
+      *out++ = char32_t(code_point);
+      i += 2;
+      continue;
+    }
+    if ((lead & 0xF0) == 0xE0) {
+      // 1110xxxx 10xxxxxx 10xxxxxx
+      if (i + 2 >= len) {
+        return result(error_code::TOO_SHORT, i);
+      }
+      if (!is_continuation(bytes[i + 1]) || !is_continuation(bytes[i + 2])) {
+        return result(error_code::TOO_SHORT, i);
+      }
+      const uint32_t code_point = (lead & 0x0F) << 12 |
+                                  (bytes[i + 1] & 0x3F) << 6 |
+                                  (bytes[i + 2] & 0x3F);
+      // Three bytes must encode U+0800..U+FFFF ...
+      if (code_point < 0x800 || code_point > 0xffff) {
+        return result(error_code::OVERLONG, i);
+      }
+      // ... and must stay out of the surrogate range.
+      if (code_point > 0xd7ff && code_point < 0xe000) {
+        return result(error_code::SURROGATE, i);
+      }
+      *out++ = char32_t(code_point);
+      i += 3;
+      continue;
+    }
+    if ((lead & 0xF8) == 0xF0) {
+      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+      if (i + 3 >= len) {
+        return result(error_code::TOO_SHORT, i);
+      }
+      if (!is_continuation(bytes[i + 1]) || !is_continuation(bytes[i + 2]) ||
+          !is_continuation(bytes[i + 3])) {
+        return result(error_code::TOO_SHORT, i);
+      }
+      const uint32_t code_point =
+          (lead & 0x07) << 18 | (bytes[i + 1] & 0x3F) << 12 |
+          (bytes[i + 2] & 0x3F) << 6 | (bytes[i + 3] & 0x3F);
+      if (code_point <= 0xffff) {
+        return result(error_code::OVERLONG, i);
+      }
+      if (code_point > 0x10ffff) {
+        return result(error_code::TOO_LARGE, i);
+      }
+      *out++ = char32_t(code_point);
+      i += 4;
+      continue;
+    }
+    // No lead pattern matched: stray continuation byte or invalid lead.
+    return result(is_continuation(lead) ? error_code::TOO_LONG
+                                        : error_code::HEADER_BITS, i);
+  }
+  return result(error_code::SUCCESS, out - utf32_output);
+}
+
+/**
+ * Called once an error has been detected somewhere around 'buf', with up to
+ * 'len' input bytes left. The offending sequence may start at 'buf' itself
+ * or up to 3 bytes earlier, so the function steps back to the nearest byte
+ * that is not a continuation byte and runs convert_with_errors from there.
+ *
+ * prior_bytes is the number of bytes before 'buf' that belong to the current
+ * memory section and can be read safely. It caps how far back the function
+ * looks; it never looks more than three bytes before 'buf'.
+ *
+ * The caller is responsible for ensuring that len > 0.
+ *
+ * If the error lies before 'buf', the count in the result is -1, -2 or -3
+ * stored in a size_t, i.e. SIZE_MAX, SIZE_MAX - 1 or SIZE_MAX - 2.
+ */
+inline result rewind_and_convert_with_errors(size_t prior_bytes,
+                                             const char *buf, size_t len,
+                                             char32_t *utf32_output) {
+  size_t extra_len{0};
+  // Look backwards for the byte that starts the sequence we are inside of.
+  size_t how_far_back = 3; // inspect 'buf' and at most 3 bytes before it
+  if (how_far_back > prior_bytes) {
+    how_far_back = prior_bytes;
+  }
+  bool found_leading_bytes{false};
+  // Inclusive bound on purpose: i == 0 inspects 'buf' itself.
+  for (size_t i = 0; i <= how_far_back; i++) {
+    unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
+    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
+    if (found_leading_bytes) {
+      if (i > 0 && byte < 128) {
+        // An ASCII byte found before 'buf' is complete on its own, so the
+        // byte right after it (offset 1 - i) is a stray continuation byte.
+        return result(error_code::TOO_LONG, 0 - i + 1);
+      }
+      buf -= i;
+      extra_len = i;
+      break;
+    }
+  }
+  //
+  // The count returned by this function can be "negative": it is an offset
+  // relative to the original 'buf', and errors located before 'buf' yield
+  // values below zero. size_t is an unsigned type, so such values wrap
+  // around (well-defined arithmetic) instead of being stored as negatives.
+  //
+  // Callers have to interpret the count with that wrap-around in mind.
+  //
+  if (!found_leading_bytes) {
+    // Every inspected byte is a continuation byte: with how_far_back == 3
+    // that is four continuation bytes in a row ending at 'buf'; with a
+    // smaller how_far_back the readable data does not start with a lead
+    // byte. The error is reported at the earliest byte inspected.
+    return result(error_code::TOO_LONG, 0 - how_far_back);
+  }
+
+  result res = convert_with_errors(buf, len + extra_len, utf32_output);
+  if (res.error) {
+    res.count -= extra_len;
+  }
+  return res;
+}
+
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
+
+/* begin file src/scalar/latin1_to_utf16/latin1_to_utf16.h */
+#ifndef SIMDUTF_LATIN1_TO_UTF16_H
+#define SIMDUTF_LATIN1_TO_UTF16_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1_to_utf16 {
+
+template <endianness big_endian>
+inline size_t convert(const char *buf, size_t len, char16_t *utf16_output) {
+  // Widens every Latin-1 byte to one 16-bit code unit and returns how many
+  // were written; units pass through utf16::swap_bytes if !match_system().
+  const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(buf);
+  char16_t *out = utf16_output;
+  for (size_t i = 0; i < len; ++i) {
+    const uint16_t unit = uint16_t(bytes[i]);
+    if (match_system(big_endian)) {
+      *out++ = char16_t(unit);
+    } else {
+      *out++ = char16_t(utf16::swap_bytes(unit));
+    }
+  }
+  return out - utf16_output;
+}
+
+template <endianness big_endian>
+inline result convert_with_errors(const char *buf, size_t len,
+                                  char16_t *utf16_output) {
+  // Widens every Latin-1 byte to one 16-bit code unit. No input byte is
+  // rejected by this function, so the result is always error_code::SUCCESS
+  // together with the number of code units written. Units pass through
+  // utf16::swap_bytes when match_system(big_endian) is false.
+  const uint8_t *in = reinterpret_cast<const uint8_t *>(buf);
+  const uint8_t *const in_end = in + len;
+  char16_t *out = utf16_output;
+  for (; in != in_end; ++in) {
+    const uint16_t unit = uint16_t(*in);
+    *out++ = char16_t(match_system(big_endian) ? unit
+                                               : utf16::swap_bytes(unit));
+  }
+  return result(error_code::SUCCESS, out - utf16_output);
+}
+
+} // namespace latin1_to_utf16
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/latin1_to_utf16/latin1_to_utf16.h */
+/* begin file src/scalar/latin1_to_utf32/latin1_to_utf32.h */
+#ifndef SIMDUTF_LATIN1_TO_UTF32_H
+#define SIMDUTF_LATIN1_TO_UTF32_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace latin1_to_utf32 {
+
+inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) {
+  // Zero-extends each Latin-1 byte to one char32_t; `len` values are written.
+  const unsigned char *in = reinterpret_cast<const unsigned char *>(buf);
+  for (const unsigned char *const in_end = in + len; in != in_end; ++in) {
+    *utf32_output++ = static_cast<char32_t>(*in);
+  }
+  return len;
+}
+
+} // namespace latin1_to_utf32
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/latin1_to_utf32/latin1_to_utf32.h */
+
+/* begin file src/scalar/utf8_to_latin1/utf8_to_latin1.h */
+#ifndef SIMDUTF_UTF8_TO_LATIN1_H
+#define SIMDUTF_UTF8_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_latin1 {
+
+inline size_t convert(const char *buf, size_t len, char *latin_output) {
+  // Validating UTF-8 to Latin-1 transcoder.
+  //
+  // Writes one byte per decoded code point to `latin_output` and returns the
+  // number of bytes written. Returns 0 as soon as the input cannot be
+  // converted: a two-byte sequence that is truncated, lacks its continuation
+  // byte, is overlong (below U+0080) or encodes a value above U+00FF, or any
+  // byte that is neither ASCII nor the lead of a two-byte sequence. Output
+  // written before the failure is not undone, and 0 is also the result for
+  // an empty input.
+  const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(buf);
+  char *out = latin_output;
+  size_t i = 0;
+
+  while (i < len) {
+    // Fast path: OR two 8-byte words together; if no byte has its high bit
+    // set, all sixteen bytes are ASCII and are copied through unchanged.
+    if (i + 16 <= len) {
+      uint64_t low_half;
+      uint64_t high_half;
+      ::memcpy(&low_half, bytes + i, sizeof(low_half));
+      ::memcpy(&high_half, bytes + i + sizeof(low_half), sizeof(high_half));
+      if (((low_half | high_half) & 0x8080808080808080) == 0) {
+        for (const size_t stop = i + 16; i < stop; ++i) {
+          *out++ = char(buf[i]);
+        }
+        continue;
+      }
+    }
+
+    const uint8_t lead = bytes[i];
+    if (lead < 0x80) {
+      // 0xxxxxxx: a lone ASCII byte.
+      *out++ = char(lead);
+      i += 1;
+      continue;
+    }
+    if ((lead & 0xE0) != 0xC0) {
+      // Three- and four-byte leads, stray continuation bytes and invalid
+      // bytes: none of them is accepted here.
+      return 0;
+    }
+
+    // 110xxxxx 10xxxxxx
+    if (i + 1 >= len) {
+      return 0; // the sequence runs past the end of the input
+    }
+    const uint8_t trail = bytes[i + 1];
+    if ((trail & 0xC0) != 0x80) {
+      return 0; // a continuation byte must look like 10xxxxxx
+    }
+    // Drop the 110 / 10 marker bits and join the remaining 5 + 6 payload
+    // bits into the code point.
+    const uint32_t code_point = (lead & 0x1F) << 6 | (trail & 0x3F);
+    // U+0080..U+00FF is the non-ASCII part of Latin-1. Smaller values are
+    // overlong encodings of ASCII; larger ones have no Latin-1 byte.
+    if (code_point < 0x80 || code_point > 0xFF) {
+      return 0;
+    }
+    *out++ = char(code_point);
+    i += 2;
+  }
+
+  return out - latin_output;
+}
+
+inline result convert_with_errors(const char *buf, size_t len,
+                                  char *latin_output) {
+  // Validating UTF-8 to Latin-1 transcoder that reports where it failed.
+  //
+  // On success the result carries error_code::SUCCESS and the number of bytes
+  // written to `latin_output`. On failure it carries the error code and the
+  // byte offset in `buf` of the offending lead byte:
+  //   TOO_SHORT   - a two-byte sequence runs past the end of the input or
+  //                 lacks its continuation byte
+  //   OVERLONG    - a two-byte sequence encodes a value below U+0080
+  //   TOO_LARGE   - a two-byte sequence encodes a value above U+00FF, or the
+  //                 lead byte announces a three- or four-byte sequence
+  //   TOO_LONG    - a continuation byte appears where a lead byte is expected
+  //   HEADER_BITS - the byte is neither a valid lead nor a continuation byte
+  const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(buf);
+  char *out = latin_output;
+  size_t i = 0;
+
+  while (i < len) {
+    // Fast path: OR two 8-byte words together; if no byte has its high bit
+    // set, all sixteen bytes are ASCII and are copied through unchanged.
+    if (i + 16 <= len) {
+      uint64_t low_half;
+      uint64_t high_half;
+      ::memcpy(&low_half, bytes + i, sizeof(low_half));
+      ::memcpy(&high_half, bytes + i + sizeof(low_half), sizeof(high_half));
+      if (((low_half | high_half) & 0x8080808080808080) == 0) {
+        for (const size_t stop = i + 16; i < stop; ++i) {
+          *out++ = char(buf[i]);
+        }
+        continue;
+      }
+    }
+
+    const uint8_t lead = bytes[i];
+    if (lead < 0x80) {
+      // 0xxxxxxx: a lone ASCII byte.
+      *out++ = char(lead);
+      i += 1;
+      continue;
+    }
+    if ((lead & 0xE0) == 0xC0) {
+      // 110xxxxx 10xxxxxx
+      if (i + 1 >= len) {
+        return result(error_code::TOO_SHORT, i);
+      }
+      const uint8_t trail = bytes[i + 1];
+      if ((trail & 0xC0) != 0x80) {
+        return result(error_code::TOO_SHORT, i);
+      }
+      // Drop the 110 / 10 marker bits and join the remaining 5 + 6 payload
+      // bits into the code point.
+      const uint32_t code_point = (lead & 0x1F) << 6 | (trail & 0x3F);
+      if (code_point < 0x80) {
+        return result(error_code::OVERLONG, i);
+      }
+      if (code_point > 0xFF) {
+        return result(error_code::TOO_LARGE, i);
+      }
+      // Here the code point is in U+0080..U+00FF, the non-ASCII Latin-1 range.
+      *out++ = char(code_point);
+      i += 2;
+      continue;
+    }
+    // A three- or four-byte lead is rejected without reading any further.
+    const bool three_byte_lead = (lead & 0xF0) == 0xE0;
+    const bool four_byte_lead = (lead & 0xF8) == 0xF0;
+    if (three_byte_lead || four_byte_lead) {
+      return result(error_code::TOO_LARGE, i);
+    }
+    // Whatever is left is a stray continuation byte or an invalid lead byte.
+    if ((lead & 0xC0) == 0x80) {
+      return result(error_code::TOO_LONG, i);
+    }
+    return result(error_code::HEADER_BITS, i);
+  }
+  return result(error_code::SUCCESS, out - latin_output);
+}
+
+inline result rewind_and_convert_with_errors(size_t prior_bytes,
+                                             const char *buf, size_t len,
+                                             char *latin1_output) {
+  size_t extra_len{0};
+  // Purpose: an error was detected around 'buf'. Step back to the nearest
+  // byte that is not a continuation byte and run convert_with_errors from
+  // there. Up to prior_bytes bytes before 'buf' may be read.
+  size_t how_far_back = prior_bytes;
+  // In theory three bytes back would be sufficient, but the error can sit
+  // farther back, so the whole readable prefix is searched.
+  bool found_leading_bytes{false};
+  // Inclusive bound on purpose: i == 0 inspects 'buf' itself.
+  for (size_t i = 0; i <= how_far_back; i++) {
+    unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
+    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
+    if (found_leading_bytes) {
+      if (i > 0 && byte < 128) {
+        // An ASCII byte found before 'buf' is complete on its own, so the
+        // byte right after it (offset 1 - i) is a stray continuation byte.
+        return result(error_code::TOO_LONG, 0 - i + 1);
+      }
+      buf -= i;
+      extra_len = i;
+      break;
+    }
+  }
+  //
+  // The count returned by this function can be "negative": it is an offset
+  // relative to the original 'buf', and errors located before 'buf' yield
+  // values below zero. size_t is an unsigned type, so such values wrap
+  // around (well-defined arithmetic) instead of being stored as negatives.
+  //
+  // Callers have to interpret the count with that wrap-around in mind.
+  //
+  if (!found_leading_bytes) {
+    // Every byte from 'buf - prior_bytes' up to 'buf' is a continuation
+    // byte, so the readable data does not begin with a lead byte. The
+    // error is reported at the earliest byte inspected, i.e. at offset
+    // -prior_bytes relative to 'buf' (wrapped, see above).
+    return result(error_code::TOO_LONG, 0 - how_far_back);
+  }
+  result res = convert_with_errors(buf, len + extra_len, latin1_output);
+  if (res.error) {
+    res.count -= extra_len;
+  }
+  return res;
+}
+
+} // namespace utf8_to_latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf8_to_latin1/utf8_to_latin1.h */
+/* begin file src/scalar/utf16_to_latin1/utf16_to_latin1.h */
+#ifndef SIMDUTF_UTF16_TO_LATIN1_H
+#define SIMDUTF_UTF16_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_latin1 {
+
+#include <cstring> // for std::memcpy
+
+template <endianness big_endian>
+inline size_t convert(const char16_t *buf, size_t len, char *latin_output) {
+  // Narrows UTF-16 code units to Latin-1 bytes and returns how many bytes
+  // were written, or 0 if any unit was above 0xFF. The low byte of every
+  // unit is written regardless; the range check happens once, after the
+  // loop. Units pass through utf16::swap_bytes if !match_system(big_endian).
+  if (len == 0) {
+    return 0;
+  }
+  const uint16_t *const units = reinterpret_cast<const uint16_t *>(buf);
+  char *out = latin_output;
+  uint16_t seen_bits = 0;
+  for (size_t i = 0; i < len; ++i) {
+    const uint16_t unit =
+        !match_system(big_endian) ? utf16::swap_bytes(units[i]) : units[i];
+    seen_bits |= unit;
+    *out++ = char(unit & 0xFF);
+  }
+  if ((seen_bits & 0xFF00) != 0) {
+    return 0;
+  }
+  return out - latin_output;
+}
+
+template <endianness big_endian>
+inline result convert_with_errors(const char16_t *buf, size_t len,
+                                  char *latin_output) {
+  // Narrows UTF-16 code units to Latin-1 bytes and reports the first unit
+  // that does not fit.
+  //
+  // On success the result carries error_code::SUCCESS and the number of bytes
+  // written to `latin_output`; otherwise it carries error_code::TOO_LARGE and
+  // the index (in code units) of the first unit above 0xFF. Units go through
+  // utf16::swap_bytes before being examined when match_system(big_endian) is
+  // false.
+  if (len == 0) {
+    return result(error_code::SUCCESS, 0);
+  }
+  const uint16_t *const units = reinterpret_cast<const uint16_t *>(buf);
+  char *out = latin_output;
+  size_t i = 0;
+
+  while (i < len) {
+    // Fast path: test sixteen code units (32 bytes) at once; if one of them
+    // does not fit, the slow path below locates it.
+    if (i + 16 <= len) {
+      uint64_t lanes[4];
+      ::memcpy(lanes, units + i, sizeof(lanes));
+      if (!match_system(big_endian)) {
+        // Rotating a word by one byte moves the low byte of each 16-bit lane
+        // into a high-byte position, which is what the mask below inspects.
+        for (uint64_t &lane : lanes) {
+          lane = (lane >> 8) | (lane << (64 - 8));
+        }
+      }
+      const uint64_t merged = lanes[0] | lanes[1] | lanes[2] | lanes[3];
+      if ((merged & 0xFF00FF00FF00FF00) == 0) {
+        for (const size_t stop = i + 16; i < stop; ++i) {
+          *out++ = !match_system(big_endian)
+                       ? char(utf16::swap_bytes(units[i]))
+                       : char(units[i]);
+        }
+        continue;
+      }
+    }
+
+    // Slow path: one code unit at a time, stopping at the first misfit.
+    const uint16_t unit =
+        !match_system(big_endian) ? utf16::swap_bytes(units[i]) : units[i];
+    if ((unit & 0xFF00) != 0) {
+      return result(error_code::TOO_LARGE, i);
+    }
+    *out++ = char(unit & 0xFF);
+    i += 1;
+  }
+
+  return result(error_code::SUCCESS, out - latin_output);
+}
+
+} // namespace utf16_to_latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf16_to_latin1/utf16_to_latin1.h */
+/* begin file src/scalar/utf32_to_latin1/utf32_to_latin1.h */
+#ifndef SIMDUTF_UTF32_TO_LATIN1_H
+#define SIMDUTF_UTF32_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_latin1 {
+
+inline size_t convert(const char32_t *buf, size_t len, char *latin1_output) {
+  // Narrows UTF-32 values to Latin-1 bytes. The low byte of every value is
+  // written unconditionally; all values are OR-ed together so that a single
+  // check after the loop tells whether any of them was above 0xFF, in which
+  // case 0 is returned. Otherwise the number of bytes written is returned.
+  const uint32_t *const values = reinterpret_cast<const uint32_t *>(buf);
+  char *out = latin1_output;
+  uint32_t seen_bits = 0;
+  for (size_t i = 0; i < len; ++i) {
+    const uint32_t value = values[i];
+    seen_bits |= value;
+    *out++ = static_cast<char>(value & 0xFF);
+  }
+  if ((seen_bits & 0xFFFFFF00) != 0) {
+    return 0;
+  }
+  return out - latin1_output;
+}
+
+inline result convert_with_errors(const char32_t *buf, size_t len,
+                                  char *latin1_output) {
+  // Narrows UTF-32 values to Latin-1 bytes. Returns error_code::SUCCESS with
+  // the number of bytes written, or error_code::TOO_LARGE with the index of
+  // the first value above 0xFF.
+  const uint32_t *const values = reinterpret_cast<const uint32_t *>(buf);
+  char *out = latin1_output;
+  size_t i = 0;
+  while (i < len) {
+    if (i + 2 <= len) { // fast path: test two values with one 64-bit load
+      uint64_t pair;
+      ::memcpy(&pair, values + i, sizeof(pair));
+      if ((pair & 0xFFFFFF00FFFFFF00) == 0) {
+        *out++ = char(buf[i]);
+        *out++ = char(buf[i + 1]);
+        i += 2;
+        continue;
+      }
+    }
+    const uint32_t value = values[i];
+    if ((value & 0xFFFFFF00) != 0) {
+      return result(error_code::TOO_LARGE, i);
+    }
+    *out++ = static_cast<char>(value & 0xFF);
+    i += 1;
+  }
+  return result(error_code::SUCCESS, out - latin1_output);
+}
+
+} // namespace utf32_to_latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf32_to_latin1/utf32_to_latin1.h */
+
+/* begin file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
+#ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H
+#define SIMDUTF_VALID_UTF8_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf8_to_latin1 {
+
+inline size_t convert_valid(const char *buf, size_t len, char *latin_output) {
+  // Transcodes UTF-8 that the caller vouches for into Latin-1 and returns the
+  // number of bytes written to `latin_output`.
+  //
+  // Only light checking is done: the code point of a two-byte sequence is not
+  // range-checked, and a two-byte sequence cut off by the end of the input
+  // simply ends the conversion. The function returns 0 when the second byte
+  // of a two-byte sequence is not a continuation byte, or when a byte is
+  // neither ASCII nor the lead of a two-byte sequence.
+  //
+  // Output written before a failure is not undone.
+  const uint8_t *const bytes = reinterpret_cast<const uint8_t *>(buf);
+  char *out = latin_output;
+  size_t i = 0;
+
+  while (i < len) {
+    // Fast path: OR two 8-byte words together; if no byte has its high bit
+    // set, all sixteen bytes are ASCII and are copied through unchanged.
+    if (i + 16 <= len) {
+      uint64_t low_half;
+      uint64_t high_half;
+      ::memcpy(&low_half, bytes + i, sizeof(low_half));
+      ::memcpy(&high_half, bytes + i + sizeof(low_half), sizeof(high_half));
+      if (((low_half | high_half) & 0x8080808080808080) == 0) {
+        for (const size_t stop = i + 16; i < stop; ++i) {
+          *out++ = char(buf[i]);
+        }
+        continue;
+      }
+    }
+
+    const uint8_t lead = bytes[i];
+    if (lead < 0x80) {
+      // 0xxxxxxx: a lone ASCII byte.
+      *out++ = char(lead);
+      i += 1;
+      continue;
+    }
+    if ((lead & 0xE0) != 0xC0) {
+      // Not the lead of a two-byte sequence (possibly a continuation byte);
+      // no further diagnosis is attempted.
+      return 0;
+    }
+
+    // 110xxxxx 10xxxxxx
+    if (i + 1 >= len) {
+      break; // the sequence is cut off by the end of the input
+    }
+    const uint8_t trail = bytes[i + 1];
+    if ((trail & 0xC0) != 0x80) {
+      return 0; // a continuation byte must look like 10xxxxxx
+    }
+    // Drop the 110 / 10 marker bits and join the remaining 5 + 6 payload
+    // bits into the code point.
+    const uint32_t code_point = (lead & 0x1F) << 6 | (trail & 0x3F);
+    // No range check: the value is narrowed to char as it is, so only the
+    // low eight bits of a code point above U+00FF survive.
+    *out++ = char(code_point);
+    i += 2;
+  }
+
+  return out - latin_output;
+}
+
+} // namespace utf8_to_latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
+/* begin file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
+#ifndef SIMDUTF_VALID_UTF16_TO_LATIN1_H
+#define SIMDUTF_VALID_UTF16_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf16_to_latin1 {
+
+template <endianness big_endian>
+inline size_t convert_valid(const char16_t *buf, size_t len,
+                            char *latin_output) {
+  // Narrows every UTF-16 code unit to one byte without any range check and
+  // returns the number of bytes written. Units pass through
+  // utf16::swap_bytes first when match_system(big_endian) is false; the
+  // conversion to char then keeps whatever fits in a char.
+  const uint16_t *const units = reinterpret_cast<const uint16_t *>(buf);
+  char *out = latin_output;
+  for (size_t i = 0; i < len; ++i) {
+    const uint16_t unit =
+        !match_system(big_endian) ? utf16::swap_bytes(units[i]) : units[i];
+    *out++ = char(unit);
+  }
+  return out - latin_output;
+}
+
+} // namespace utf16_to_latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
+/* begin file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
+#ifndef SIMDUTF_VALID_UTF32_TO_LATIN1_H
+#define SIMDUTF_VALID_UTF32_TO_LATIN1_H
+
+namespace simdutf {
+namespace scalar {
+namespace {
+namespace utf32_to_latin1 {
+
+inline size_t convert_valid(const char32_t *buf, size_t len,
+                            char *latin1_output) {
+  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
+  char *start = latin1_output;
+  uint32_t utf32_char;
+  size_t pos = 0;
+  // Returns the number of bytes written, or 0 if a value above 0xFF is met.
+  while (pos < len) {
+    utf32_char = (uint32_t)data[pos];
+    // Fast path: test two code points at once through a single 64-bit load.
+    if (pos + 2 <=
+        len) { // at least two code points (8 bytes) remain: check both
+      uint64_t v;
+      ::memcpy(&v, data + pos, sizeof(uint64_t));
+      if ((v & 0xFFFFFF00FFFFFF00) == 0) { // upper 24 bits of each are zero
+        *latin1_output++ = char(buf[pos]);
+        *latin1_output++ = char(buf[pos + 1]);
+        pos += 2;
+        continue;
+      } else {
+        // output cannot be represented in latin1
+        return 0;
+      }
+    }
+    if ((utf32_char & 0xFFFFFF00) == 0) { // single trailing code point
+      *latin1_output++ = char(utf32_char);
+    } else {
+      // output cannot be represented in latin1
+      return 0;
+    }
+    pos++;
+  }
+  return latin1_output - start;
+}
+
+} // namespace utf32_to_latin1
+} // unnamed namespace
+} // namespace scalar
+} // namespace simdutf
+
+#endif
+/* end file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
+
+SIMDUTF_PUSH_DISABLE_WARNINGS
+SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+
+#if SIMDUTF_IMPLEMENTATION_ARM64
+/* begin file src/arm64/implementation.cpp */
+/* begin file src/simdutf/arm64/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "arm64"
+// #define SIMDUTF_IMPLEMENTATION arm64
+/* end file src/simdutf/arm64/begin.h */
+namespace simdutf {
+namespace arm64 {
+namespace {
+#ifndef SIMDUTF_ARM64_H
+  #error "arm64.h must be included"
+#endif
+using namespace simd;
+
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
+  simd8<uint8_t> bits = input.reduce_or(); // presumably ORs all input bytes
+  return bits.max_val() < 0b10000000u;     // below 0x80: no byte has bit 7 set
+}
+
+simdutf_unused simdutf_really_inline simd8<bool>
+must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
+                     const simd8<uint8_t> prev3) {
+  simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u); // 2+ byte lead
+  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);  // 3+ byte lead
+  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u); // 4 byte lead
+  // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller
+  // is using ^ as well. This will work fine because we only have to report
+  // errors for cases with 0-1 lead bytes. Multiple lead bytes implies 2
+  // overlapping multibyte characters, and if that happens, there is guaranteed
+  // to be at least *one* lead byte that is part of only 1 other multibyte
+  // character. The error will be detected there.
+  return is_second_byte ^ is_third_byte ^ is_fourth_byte;
+}
+
+simdutf_really_inline simd8<bool>
+must_be_2_3_continuation(const simd8<uint8_t> prev2,
+                         const simd8<uint8_t> prev3) {
+  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);  // 3+ byte lead
+  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u); // 4 byte lead
+  return is_third_byte ^ is_fourth_byte; // XOR of the two lane masks
+}
+
+// common functions for utf8 conversions
+simdutf_really_inline uint16x4_t convert_utf8_3_byte_to_utf16(uint8x16_t in) {
+  // Input: four 3-byte sequences in bytes 0..11. After the shuffle below:
+  // low half = 10cccccc|1110aaaa, high half = 10bbbbbb|10bbbbbb (per lane).
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  const uint8x16_t sh = simdutf_make_uint8x16_t(0, 2, 3, 5, 6, 8, 9, 11, 1, 1,
+                                                4, 4, 7, 7, 10, 10);
+#else
+  const uint8x16_t sh = {0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10};
+#endif
+  uint8x16_t perm = vqtbl1q_u8(in, sh);
+  // Split into half vectors.
+  // 10cccccc|1110aaaa
+  uint8x8_t perm_low = vget_low_u8(perm); // no-op
+  // 10bbbbbb|10bbbbbb
+  uint8x8_t perm_high = vget_high_u8(perm);
+  // xxxxxxxx 10bbbbbb
+  uint16x4_t mid = vreinterpret_u16_u8(perm_high); // no-op
+  // xxxxxxxx 1110aaaa
+  uint16x4_t high = vreinterpret_u16_u8(perm_low); // no-op
+  // Assemble with shift left insert.
+  // xxxxxxaa aabbbbbb
+  uint16x4_t mid_high = vsli_n_u16(mid, high, 6);
+  // (perm_low << 8) | (perm_low >> 8)
+  // xxxxxxxx 10cccccc
+  uint16x4_t low = vreinterpret_u16_u8(vrev16_u8(perm_low));
+  // Shift left insert into the low bits
+  // aaaabbbb bbcccccc
+  uint16x4_t composed = vsli_n_u16(low, mid_high, 6);
+  return composed; // four UTF-16 code units, one per 3-byte sequence
+}
+
+simdutf_really_inline uint16x8_t convert_utf8_2_byte_to_utf16(uint8x16_t in) {
+  // Converts six 2-byte UTF-8 characters to six UTF-16 code units.
+  // Technically this calculates 8, but 6 does better and happens more often
+  // (The languages which use these codepoints use ASCII spaces so 8 would need
+  // to be in the middle of a very long word).
+
+  // 10bbbbbb 110aaaaa
+  uint16x8_t upper = vreinterpretq_u16_u8(in);
+  // (in << 8) | (in >> 8)
+  // 110aaaaa 10bbbbbb
+  uint16x8_t lower = vreinterpretq_u16_u8(vrev16q_u8(in));
+  // 00000000 000aaaaa
+  uint16x8_t upper_masked = vandq_u16(upper, vmovq_n_u16(0x1F));
+  // Assemble with shift left insert.
+  // 00000aaa aabbbbbb
+  uint16x8_t composed = vsliq_n_u16(lower, upper_masked, 6);
+  return composed; // all 8 lanes are computed; see the note above about 6
+}
+
+simdutf_really_inline uint16x8_t
+convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in, size_t shufutf8_idx) {
+  // Converts six 1-2 byte UTF-8 characters to six UTF-16 code units.
+  // This is a relatively easy scenario:
+  // we process SIX (6) input code points. The max length in bytes of six
+  // code points spanning between 1 and 2 bytes each is 12 bytes.
+  uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
+      simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]));
+  // Shuffle with the table row selected by shufutf8_idx
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 110aaaaa 10bbbbbb
+  uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh));
+  // Mask
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 00000000 00bbbbbb
+  uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits
+  // 1 byte: 00000000 00000000
+  // 2 byte: 000aaaaa 00000000
+  uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits
+  // Combine with a shift right accumulate
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 00000aaa aabbbbbb
+  uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2);
+  return composed;
+}
+
+/* begin file src/arm64/arm_validate_utf16.cpp */
+template <endianness big_endian>
+const char16_t *arm_validate_utf16(const char16_t *input, size_t size) {
+  const char16_t *end = input + size; // one past the last code unit
+  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+  const auto v_fc = simd8<uint8_t>::splat(0xfc);
+  const auto v_dc = simd8<uint8_t>::splat(0xdc);
+  while (end - input >= 16) { // 16 code units are loaded per iteration
+    // 0. Load data: since the validation takes into account only higher
+    //    byte of each word, we compress the two vectors into one which
+    //    consists only of the higher bytes.
+    auto in0 = simd16<uint16_t>(input);
+    auto in1 =
+        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+    if (!match_system(big_endian)) { // swap the two bytes of every code unit
+      in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0)));
+      in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1)));
+    }
+    const auto t0 = in0.shr<8>();
+    const auto t1 = in1.shr<8>();
+    const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
+    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+    const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
+    if (surrogates_wordmask == 0) {
+      input += 16;
+    } else {
+      // 2. We have some surrogates that have to be distinguished:
+      //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+      //    (Unicode names these two ranges the other way around.)
+      //    Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+      // V - non-surrogate code units
+      //     V = not surrogates_wordmask
+      const uint64_t V = ~surrogates_wordmask;
+
+      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+      const auto vH = ((in & v_fc) == v_dc);
+      const uint64_t H = vH.to_bitmask64();
+
+      // L - word mask for low surrogates
+      //     L = not H and surrogates_wordmask
+      const uint64_t L = ~H & surrogates_wordmask;
+
+      const uint64_t a =
+          L & (H >> 4); // A low surrogate must be followed by a high one
+                        // (the mask seems to hold 4 bits per code unit).
+                        // A low surrogate in the last unit is handled below.
+      const uint64_t b =
+          a << 4; // Also mark the high surrogate of each valid pair, so
+                  // that two masks (a and b) cover all valid surrogates.
+      const uint64_t c = V | a | b; // Combine all the masks into the final one.
+      if (c == ~0ull) {
+        // The whole input register contains valid UTF-16, i.e.,
+        // either single code units or proper surrogate pairs.
+        input += 16;
+      } else if (c == 0xfffffffffffffffull) {
+        // The 15 lower code units of the input register contain valid UTF-16.
+        // The last code unit may be either a low or high surrogate. In the
+        // next iteration we 1) check if the low surrogate is followed by a
+        // high one, 2) reject sole high surrogate.
+        input += 15;
+      } else {
+        return nullptr; // invalid surrogate sequence
+      }
+    }
+  }
+  return input; // first code unit not validated here (fewer than 16 remain)
+}
+
+template <endianness big_endian>
+const result arm_validate_utf16_with_errors(const char16_t *input,
+                                            size_t size) {
+  const char16_t *start = input;
+  const char16_t *end = input + size;
+  // On failure the result holds the offset of the 16-unit block at fault.
+  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+  const auto v_fc = simd8<uint8_t>::splat(0xfc);
+  const auto v_dc = simd8<uint8_t>::splat(0xdc);
+  while (input + 16 < end) { // runs while more than 16 code units remain
+    // 0. Load data: since the validation takes into account only higher
+    //    byte of each word, we compress the two vectors into one which
+    //    consists only of the higher bytes.
+    auto in0 = simd16<uint16_t>(input);
+    auto in1 =
+        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+
+    if (!match_system(big_endian)) { // swap the two bytes of every code unit
+      in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0)));
+      in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1)));
+    }
+    const auto t0 = in0.shr<8>();
+    const auto t1 = in1.shr<8>();
+    const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
+    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+    const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
+    if (surrogates_wordmask == 0) {
+      input += 16;
+    } else {
+      // 2. We have some surrogates that have to be distinguished:
+      //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+      //    (Unicode names these two ranges the other way around.)
+      //    Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+      // V - non-surrogate code units
+      //     V = not surrogates_wordmask
+      const uint64_t V = ~surrogates_wordmask;
+
+      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+      const auto vH = ((in & v_fc) == v_dc);
+      const uint64_t H = vH.to_bitmask64();
+
+      // L - word mask for low surrogates
+      //     L = not H and surrogates_wordmask
+      const uint64_t L = ~H & surrogates_wordmask;
+
+      const uint64_t a =
+          L & (H >> 4); // A low surrogate must be followed by a high one
+                        // (the mask seems to hold 4 bits per code unit).
+                        // A low surrogate in the last unit is handled below.
+      const uint64_t b =
+          a << 4; // Also mark the high surrogate of each valid pair, so
+                  // that two masks (a and b) cover all valid surrogates.
+      const uint64_t c = V | a | b; // Combine all the masks into the final one.
+      if (c == ~0ull) {
+        // The whole input register contains valid UTF-16, i.e.,
+        // either single code units or proper surrogate pairs.
+        input += 16;
+      } else if (c == 0xfffffffffffffffull) {
+        // The 15 lower code units of the input register contain valid UTF-16.
+        // The last code unit may be either a low or high surrogate. In the
+        // next iteration we 1) check if the low surrogate is followed by a
+        // high one, 2) reject sole high surrogate.
+        input += 15;
+      } else {
+        return result(error_code::SURROGATE, input - start); // block start
+      }
+    }
+  }
+  return result(error_code::SUCCESS, input - start); // units validated so far
+}
+/* end file src/arm64/arm_validate_utf16.cpp */
+/* begin file src/arm64/arm_validate_utf32le.cpp */
+
+const char32_t *arm_validate_utf32le(const char32_t *input, size_t size) {
+  const char32_t *end = input + size;
+  // Running maxima are accumulated in the loop and tested once afterwards.
+  const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); // largest code point
+  const uint32x4_t offset = vmovq_n_u32(0xffff2000);
+  const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
+  uint32x4_t currentmax = vmovq_n_u32(0x0);
+  uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
+  // Adding offset maps 0xD800..0xDFFF onto 0xFFFFF800..0xFFFFFFFF (mod 2^32).
+  while (end - input >= 4) {
+    const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input));
+    currentmax = vmaxq_u32(in, currentmax);
+    currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
+    input += 4;
+  }
+  // A non-zero lane means some value exceeded 0x10FFFF.
+  uint32x4_t is_zero =
+      veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
+  if (vmaxvq_u32(is_zero) != 0) {
+    return nullptr;
+  }
+  // A non-zero lane means some value was a surrogate (0xD800..0xDFFF).
+  is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax),
+                      standardoffsetmax);
+  if (vmaxvq_u32(is_zero) != 0) {
+    return nullptr;
+  }
+  // Valid so far: return the first of the (fewer than 4) unchecked values.
+  return input;
+}
+
+const result arm_validate_utf32le_with_errors(const char32_t *input,
+                                              size_t size) {
+  const char32_t *start = input;
+  const char32_t *end = input + size;
+  // The running maxima are tested after each group of 4 values.
+  const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); // largest code point
+  const uint32x4_t offset = vmovq_n_u32(0xffff2000);
+  const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
+  uint32x4_t currentmax = vmovq_n_u32(0x0);
+  uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
+  // Adding offset maps 0xD800..0xDFFF onto 0xFFFFF800..0xFFFFFFFF (mod 2^32).
+  while (end - input >= 4) {
+    const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input));
+    currentmax = vmaxq_u32(in, currentmax);
+    currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
+    // A non-zero lane means some value so far exceeded 0x10FFFF.
+    uint32x4_t is_zero =
+        veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
+    if (vmaxvq_u32(is_zero) != 0) {
+      return result(error_code::TOO_LARGE, input - start); // group start
+    }
+    // A non-zero lane means some value so far was a surrogate.
+    is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax),
+                        standardoffsetmax);
+    if (vmaxvq_u32(is_zero) != 0) {
+      return result(error_code::SURROGATE, input - start); // group start
+    }
+
+    input += 4;
+  }
+  // Offset of the first value not checked here (fewer than 4 remain).
+  return result(error_code::SUCCESS, input - start);
+}
+/* end file src/arm64/arm_validate_utf32le.cpp */
+
+/* begin file src/arm64/arm_convert_latin1_to_utf16.cpp */
+template <endianness big_endian>
+std::pair<const char *, char16_t *>
+arm_convert_latin1_to_utf16(const char *buf, size_t len,
+                            char16_t *utf16_output) {
+  const char *end = buf + len;
+  // Widen 16 Latin1 bytes to 16 UTF-16 code units per iteration.
+  while (end - buf >= 16) {
+    uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(buf));
+    uint16x8_t inlow = vmovl_u8(vget_low_u8(in8)); // bytes 0..7, zero-extended
+    if (!match_system(big_endian)) { // swap the two bytes of every code unit
+      inlow = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inlow)));
+    }
+    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), inlow);
+    uint16x8_t inhigh = vmovl_u8(vget_high_u8(in8)); // bytes 8..15
+    if (!match_system(big_endian)) {
+      inhigh = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inhigh)));
+    }
+    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output + 8), inhigh);
+    utf16_output += 16;
+    buf += 16;
+  }
+  // Returns the first unconverted input byte and the next output position.
+  return std::make_pair(buf, utf16_output);
+}
+/* end file src/arm64/arm_convert_latin1_to_utf16.cpp */
+/* begin file src/arm64/arm_convert_latin1_to_utf32.cpp */
+std::pair<const char *, char32_t *>
+arm_convert_latin1_to_utf32(const char *buf, size_t len,
+                            char32_t *utf32_output) {
+  const char *end = buf + len;
+  // Zero-extend 16 Latin1 bytes to 16 UTF-32 values per iteration.
+  while (end - buf >= 16) {
+    uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(buf));
+    uint16x8_t in8low = vmovl_u8(vget_low_u8(in8));
+    uint32x4_t in16lowlow = vmovl_u16(vget_low_u16(in8low));   // bytes 0..3
+    uint32x4_t in16lowhigh = vmovl_u16(vget_high_u16(in8low)); // bytes 4..7
+    uint16x8_t in8high = vmovl_u8(vget_high_u8(in8));
+    uint32x4_t in8highlow = vmovl_u16(vget_low_u16(in8high));   // bytes 8..11
+    uint32x4_t in8highhigh = vmovl_u16(vget_high_u16(in8high)); // bytes 12..15
+    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output), in16lowlow);
+    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 4), in16lowhigh);
+    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 8), in8highlow);
+    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 12), in8highhigh);
+
+    utf32_output += 16;
+    buf += 16;
+  }
+  // Returns the first unconverted input byte and the next output position.
+  return std::make_pair(buf, utf32_output);
+}
+/* end file src/arm64/arm_convert_latin1_to_utf32.cpp */
+/* begin file src/arm64/arm_convert_latin1_to_utf8.cpp */
+/*
+  Returns a pair: the first unprocessed input byte and the next output byte.
+  A scalar routine should carry on the conversion of the tail.
+*/
+std::pair<const char *, char *>
+arm_convert_latin1_to_utf8(const char *latin1_input, size_t len,
+                           char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char *end = latin1_input + len;
+  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+  // We always write 16 bytes, of which more than the first 8 bytes
+  // are valid. A safety margin of 8 is more than sufficient.
+  while (end - latin1_input >= 16 + 8) {
+    uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(latin1_input));
+    if (vmaxvq_u8(in8) <= 0x7F) { // ASCII fast path!!!!
+      vst1q_u8(utf8_output, in8);
+      utf8_output += 16;
+      latin1_input += 16;
+      continue;
+    }
+    // Slow path: only the low 8 input bytes are converted in this iteration.
+    // We just fallback on UTF-16 code. This could be optimized/simplified
+    // further.
+    uint16x8_t in16 = vmovl_u8(vget_low_u8(in8));
+    // 1. prepare 2-byte values
+    // input 8-bit word : [aabb|bbbb] x 8
+    // expected output   : [1100|00aa|10bb|bbbb] x 8
+    const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+    const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+    // t0 = [0000|00aa|bbbb|bb00]
+    const uint16x8_t t0 = vshlq_n_u16(in16, 2);
+    // t1 = [0000|00aa|0000|0000]
+    const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+    // t2 = [0000|0000|00bb|bbbb]
+    const uint16x8_t t2 = vandq_u16(in16, v_003f);
+    // t3 = [0000|00aa|00bb|bbbb]
+    const uint16x8_t t3 = vorrq_u16(t1, t2);
+    // t4 = [1100|00aa|10bb|bbbb]
+    const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+    // 2. merge ASCII and 2-byte codewords
+    const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+    const uint16x8_t one_byte_bytemask = vcleq_u16(in16, v_007f);
+    const uint8x16_t utf8_unpacked =
+        vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in16, t4));
+    // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+    const uint16x8_t mask = simdutf_make_uint16x8_t(
+        0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
+#else
+    const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
+                             0x0002, 0x0008, 0x0020, 0x0080};
+#endif
+    uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); // ASCII bits
+    // 4. pack the bytes
+    const uint8_t *row =
+        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+    const uint8x16_t shuffle = vld1q_u8(row + 1); // row[0] is the output size
+    const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+    // 5. store bytes
+    vst1q_u8(utf8_output, utf8_packed);
+    // 6. adjust pointers
+    latin1_input += 8;
+    utf8_output += row[0];
+
+  } // while
+
+  return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
+}
+/* end file src/arm64/arm_convert_latin1_to_utf8.cpp */
+
+/* begin file src/arm64/arm_convert_utf8_to_latin1.cpp */
+// Convert a block of utf8 bytes to latin1 using a mask indicating the
+// end of the code points. The table lookup reads only the least significant
+// 12 bits of the mask; the ASCII fast path compares the whole mask.
+// It returns how many input bytes were consumed.
+size_t convert_masked_utf8_to_latin1(const char *input,
+                                     uint64_t utf8_end_of_code_point_mask,
+                                     char *&latin1_output) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff;
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it
+  // is maybe beneficial to have fast paths that depend on branch prediction but
+  // have less latency. This results in more instructions but, potentially, also
+  // higher speeds.
+  //
+  // Fast path: the next 12 bytes are ASCII. NOTE(review): the test uses the
+  // whole 64-bit mask rather than its low 12 bits; confirm this is intended.
+  if (utf8_end_of_code_point_mask == 0xfff) {
+    // 16 bytes are stored but only the first 12 count as output.
+    vst1q_u8(reinterpret_cast<uint8_t *>(latin1_output), in);
+    latin1_output += 12; // We wrote 12 8-bit characters.
+    return 12;           // We consumed 12 bytes.
+  }
+  /// We do not have a fast path available, or the fast path is unimportant, so
+  /// we fall back to the table-driven path.
+  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][0];
+
+  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][1];
+  // this indicates an invalid input (no 1-2 byte layout); nothing is written:
+  if (idx >= 64) {
+    return consumed;
+  }
+  // Here idx < 64. This is a relatively easy scenario: we process SIX (6)
+  // input code points. The max length in bytes of six code points spanning
+  // between 1 and 2 bytes each is 12 bytes. The shuffle below moves each
+  // code point into its own 16-bit lane, the two masks and the shift
+  // assemble its value, and the final narrowing keeps the low byte of every
+  // lane as the Latin1 character (values above 0xFF would be truncated;
+  // presumably validation upstream rules them out).
+  uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
+      simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+  // Shuffle
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 110aaaaa 10bbbbbb
+  uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh));
+  // Mask
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 00000000 00bbbbbb
+  uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits
+  // 1 byte: 00000000 00000000
+  // 2 byte: 000aaaaa 00000000
+  uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits
+  // Combine with a shift right accumulate
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 00000aaa aabbbbbb
+  uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2);
+  // writing 8 bytes even though we only care about the first 6 bytes.
+  uint8x8_t latin1_packed = vmovn_u16(composed);
+  vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
+  latin1_output += 6; // We wrote 6 bytes.
+  return consumed;
+}
+/* end file src/arm64/arm_convert_utf8_to_latin1.cpp */
+/* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */
+// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
+// end of the code points. Only the least significant 12 bits of the mask
+// are accessed.
+// It returns how many bytes were consumed (up to 16, usually 12).
+template <endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char *input,
+                                    uint64_t utf8_end_of_code_point_mask,
+                                    char16_t *&utf16_output) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff;
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it
+  // is maybe beneficial to have fast paths that depend on branch prediction but
+  // have less latency. This results in more instructions but, potentially, also
+  // higher speeds.
+
+  // We first try a few fast paths.
+  // The obvious first test is ASCII, which actually consumes the full 16.
+  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) {
+    // We process in chunks of 16 bytes
+    // The routine in simd.h is reused.
+    simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
+    temp.store_ascii_as_utf16<big_endian>(utf16_output);
+    utf16_output += 16; // We wrote 16 16-bit characters.
+    return 16;          // We consumed 16 bytes.
+  }
+
+  // 3 byte sequences are the next most common, as seen in CJK, which has long
+  // sequences of these.
+  if (input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
+    // UTF-16 code units.
+    uint16x4_t composed = convert_utf8_3_byte_to_utf16(in);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
+    }
+    vst1_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
+    utf16_output += 4; // We wrote 4 16-bit characters.
+    return 12;         // We consumed 12 bytes.
+  }
+
+  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+  if ((utf8_end_of_code_point_mask & 0xFFF) == 0xaaa) {
+    // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte
+    // UTF-16 code units.
+    uint16x8_t composed = convert_utf8_2_byte_to_utf16(in);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed =
+          vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
+    }
+    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
+
+    utf16_output += 6; // We wrote 6 16-bit characters.
+    return 12;         // We consumed 12 bytes.
+  }
+
+  /// We do not have a fast path available, or the fast path is unimportant, so
+  /// we fallback.
+  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][0];
+
+  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][1];
+
+  if (idx < 64) {
+    // SIX (6) input code-code units
+    // Convert to UTF-16
+    uint16x8_t composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed =
+          vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
+    }
+    // Store
+    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
+    utf16_output += 6; // We wrote 6 16-bit characters.
+    return consumed;
+  } else if (idx < 145) {
+    // FOUR (4) input code-code units
+    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
+        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+    // XXX: depending on the system scalar instructions might be faster.
+    // 1 byte: 00000000 00000000 0ccccccc
+    // 2 byte: 00000000 110bbbbb 10cccccc
+    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
+    // 1 byte: 00000000 0ccccccc
+    // 2 byte: xx0bbbbb x0cccccc
+    // 3 byte: xxbbbbbb x0cccccc
+    uint16x4_t lowperm = vmovn_u32(perm);
+    // Partially mask with bic (doesn't require a temporary register unlike and)
+    // The shift left insert below will clear the top bits.
+    // 1 byte: 00000000 00000000
+    // 2 byte: xx0bbbbb 00000000
+    // 3 byte: xxbbbbbb 00000000
+    uint16x4_t middlebyte = vbic_u16(lowperm, vmov_n_u16(uint16_t(~0xFF00)));
+    // ASCII
+    // 1 byte: 00000000 0ccccccc
+    // 2+byte: 00000000 00cccccc
+    uint16x4_t ascii = vand_u16(lowperm, vmov_n_u16(0x7F));
+    // Split into narrow vectors.
+    // 2 byte: 00000000 00000000
+    // 3 byte: 00000000 xxxxaaaa
+    uint16x4_t highperm = vshrn_n_u32(perm, 16);
+    // Shift right accumulate the middle byte
+    // 1 byte: 00000000 0ccccccc
+    // 2 byte: 00xx0bbb bbcccccc
+    // 3 byte: 00xxbbbb bbcccccc
+    uint16x4_t middlelow = vsra_n_u16(ascii, middlebyte, 2);
+    // Shift left and insert the top 4 bits, overwriting the garbage
+    // 1 byte: 00000000 0ccccccc
+    // 2 byte: 00000bbb bbcccccc
+    // 3 byte: aaaabbbb bbcccccc
+    uint16x4_t composed = vsli_n_u16(middlelow, highperm, 12);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
+    }
+    vst1_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
+
+    utf16_output += 4; // We wrote 4 16-bit codepoints
+    return consumed;
+  } else if (idx < 209) {
+    // THREE (3) input code-code units
+    if (input_utf8_end_of_code_point_mask == 0x888) {
+      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
+      // UTF-16 pairs. Generating surrogate pairs is a little tricky though, but
+      // it is easier when we can assume they are all pairs. This version does
+      // not use the LUT, but 4 byte sequences are less common and the overhead
+      // of the extra memory access is less important than the early branch
+      // overhead in shorter sequences.
+
+      // Swap byte pairs
+      // 10dddddd 10cccccc|10bbbbbb 11110aaa
+      // 10cccccc 10dddddd|11110aaa 10bbbbbb
+      uint8x16_t swap = vrev16q_u8(in);
+      // Shift left 2 bits
+      // cccccc00 dddddd00 xxxxxxxx bbbbbb00
+      uint32x4_t shift = vreinterpretq_u32_u8(vshlq_n_u8(swap, 2));
+      // Create a magic number containing the low 2 bits of the trail surrogate
+      // and all the corrections needed to create the pair. UTF-8 4b prefix   =
+      // -0x0000|0xF000 surrogate offset  = -0x0000|0x0040 (0x10000 << 6)
+      // surrogate high    = +0x0000|0xD800
+      // surrogate low     = +0xDC00|0x0000
+      // -------------------------------
+      //                   = +0xDC00|0xE7C0
+      uint32x4_t magic = vmovq_n_u32(0xDC00E7C0);
+      // Generate unadjusted trail surrogate minus lowest 2 bits
+      // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
+      uint32x4_t trail =
+          vbslq_u32(vmovq_n_u32(0x0000FF00), vreinterpretq_u32_u8(swap), shift);
+      // Insert low 2 bits of trail surrogate to magic number for later
+      // 11011100 00000000 11100111 110000cc
+      uint16x8_t magic_with_low_2 =
+          vreinterpretq_u16_u32(vsraq_n_u32(magic, shift, 30));
+      // Generate lead surrogate
+      // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
+      uint32x4_t lead = vreinterpretq_u32_u16(
+          vsliq_n_u16(vreinterpretq_u16_u8(swap), vreinterpretq_u16_u8(in), 6));
+      // Mask out lead
+      // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
+      lead = vbicq_u32(lead, vmovq_n_u32(uint32_t(~0x03FFFFFF)));
+      // Blend pairs
+      // 000000cc ccdddddd|11110aaa bbbbbb00
+      uint16x8_t blend = vreinterpretq_u16_u32(
+          vbslq_u32(vmovq_n_u32(0x0000FFFF), trail, lead));
+      // Add magic number to finish the result
+      // 110111CC CCDDDDDD|110110AA BBBBBBCC
+      uint16x8_t composed = vaddq_u16(blend, magic_with_low_2);
+      // Byte swap if necessary
+      if (!match_system(big_endian)) {
+        composed =
+            vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
+      }
+      uint16_t buffer[8];
+      vst1q_u16(reinterpret_cast<uint16_t *>(buffer), composed);
+      for (int k = 0; k < 6; k++) {
+        utf16_output[k] = buffer[k];
+      } // the loop might compiler to a couple of instructions.
+      utf16_output += 6; // We wrote 3 32-bit surrogate pairs.
+      return 12;         // We consumed 12 bytes.
+    }
+    // 3 1-4 byte sequences
+    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
+        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+
+    // 1 byte: 00000000 00000000 00000000 0ddddddd
+    // 2 byte: 00000000 00000000 110ccccc 10dddddd
+    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
+    // added to fix issue https://github.com/simdutf/simdutf/issues/514
+    // We only want to write 2 * 16-bit code units when that is actually what we
+    // have. Unfortunately, we cannot trust the input. So it is possible to get
+    // 0xff as an input byte and it should not result in a surrogate pair. We
+    // need to check for that.
+    uint32_t permbuffer[4];
+    vst1q_u32(permbuffer, perm);
+    // Mask the low and middle bytes
+    // 00000000 00000000 00000000 0ddddddd
+    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f));
+    // Because the surrogates need more work, the high surrogate is computed
+    // first.
+    uint32x4_t middlehigh = vshlq_n_u32(perm, 2);
+    // 00000000 00000000 00cccccc 00000000
+    uint32x4_t middlebyte = vandq_u32(perm, vmovq_n_u32(0x3F00));
+    // Start assembling the sequence. Since the 4th byte is in the same position
+    // as it would be in a surrogate and there is no dependency, shift left
+    // instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte:
+    // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
+    uint32x4_t ab = vbslq_u32(vmovq_n_u32(0xFF000000), perm, middlehigh);
+    // Top 16 bits contains the high ten bits of the surrogate pair before
+    // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa
+    // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction
+    uint32x4_t abc =
+        vbslq_u32(vmovq_n_u32(0xFFFC0000), ab, vshlq_n_u32(middlebyte, 4));
+    // Combine the low 6 or 7 bits by a shift right accumulate
+    // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
+    // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o
+    // correction
+    uint32x4_t composed = vsraq_n_u32(ascii, abc, 6);
+    // After this is for surrogates
+    // Blend the low and high surrogates
+    // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
+    uint32x4_t mixed = vbslq_u32(vmovq_n_u32(0xFFFF0000), abc, composed);
+    // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits
+    // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte:
+    // 11110aaa bbbbbbcc|000000cc ccdddddd
+    uint16x8_t masked_pair = vreinterpretq_u16_u32(
+        vbicq_u32(mixed, vmovq_n_u32(uint32_t(~0xFFFF03FF))));
+    // Correct the remaining UTF-8 prefix, surrogate offset, and add the
+    // surrogate prefixes in one magic 16-bit addition. similar magic number but
+    // without the continue byte adjust and halfword swapped UTF-8 4b prefix   =
+    // -0xF000|0x0000 surrogate offset  = -0x0040|0x0000 (0x10000 << 6)
+    // surrogate high    = +0xD800|0x0000
+    // surrogate low     = +0x0000|0xDC00
+    // -----------------------------------
+    //                   = +0xE7C0|0xDC00
+    uint16x8_t magic = vreinterpretq_u16_u32(vmovq_n_u32(0xE7C0DC00));
+    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
+    uint32x4_t surrogates =
+        vreinterpretq_u32_u16(vaddq_u16(masked_pair, magic));
+    // If the high bit is 1 (s32 less than zero), this needs a surrogate pair
+    uint32x4_t is_pair = vcltzq_s32(vreinterpretq_s32_u32(perm));
+
+    // Select either the 4 byte surrogate pair or the 2 byte solo codepoint
+    // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
+    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
+    uint32x4_t selected = vbslq_u32(is_pair, surrogates, composed);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      selected =
+          vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(selected)));
+    }
+    // Attempting to shuffle and store would be complex, just scalarize.
+    uint32_t buffer[4];
+    vst1q_u32(buffer, selected);
+    // Test for the top bit of the surrogate mask. Remove due to issue 514
+    // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 :
+    // 0x00800000;
+    for (size_t i = 0; i < 3; i++) {
+      // Surrogate
+      // Used to be if (buffer[i] & SURROGATE_MASK) {
+      // See discussion above.
+      // patch for issue https://github.com/simdutf/simdutf/issues/514
+      if ((permbuffer[i] & 0xf8000000) == 0xf0000000) {
+        utf16_output[0] = uint16_t(buffer[i] >> 16);
+        utf16_output[1] = uint16_t(buffer[i] & 0xFFFF);
+        utf16_output += 2;
+      } else {
+        utf16_output[0] = uint16_t(buffer[i] & 0xFFFF);
+        utf16_output++;
+      }
+    }
+    return consumed;
+  } else {
+    // here we know that there is an error but we do not handle errors
+    return 12;
+  }
+}
+/* end file src/arm64/arm_convert_utf8_to_utf16.cpp */
+/* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */
+// Convert up to 12 bytes from UTF-8 to UTF-32 using a mask marking the last
+// byte of each code point. Table lookups use only the low 12 bits of the mask;
+// the ASCII fast path compares the whole mask with 0xfff. Advances utf32_out
+// past the code points written and returns the bytes consumed (up to 12).
+size_t convert_masked_utf8_to_utf32(const char *input,
+                                    uint64_t utf8_end_of_code_point_mask,
+                                    char32_t *&utf32_out) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  // Note: the load below still reads a full 16 bytes starting at input.
+  uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out);
+  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xFFF;
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it
+  // is maybe beneficial to have fast paths that depend on branch prediction but
+  // have less latency. This results in more instructions but, potentially, also
+  // higher speeds.
+  //
+  // We first try a few fast paths.
+  if (utf8_end_of_code_point_mask == 0xfff) {
+    // Mask is exactly 0xfff: each of the 12 bytes ends a code point.
+    // use fast implementation in src/simdutf/arm64/simd.h
+    // Ideally the compiler can keep the tables in registers.
+    simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
+    temp.store_ascii_as_utf32_tbl(utf32_out);
+    utf32_output += 12; // We wrote 12 32-bit characters.
+    return 12;          // We consumed 12 bytes.
+  }
+  if (input_utf8_end_of_code_point_mask == 0x924) {
+    // Mask 0x924: four 3-byte UTF-8 sequences, which become four 4-byte
+    // UTF-32 code units. Convert to UTF-16 first.
+    uint16x4_t composed_utf16 = convert_utf8_3_byte_to_utf16(in);
+    // Zero extend and store via ST2 with a zero.
+    uint16x4x2_t interleaver = {{composed_utf16, vmov_n_u16(0)}};
+    vst2_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
+    utf32_output += 4; // We wrote 4 32-bit characters.
+    return 12;         // We consumed 12 bytes.
+  }
+
+  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+  if (input_utf8_end_of_code_point_mask == 0xaaa) {
+    // Mask 0xaaa: six 2-byte UTF-8 sequences, which become six 4-byte
+    // UTF-32 code units. Convert to UTF-16 first.
+    uint16x8_t composed_utf16 = convert_utf8_2_byte_to_utf16(in);
+    // Zero extend via ST2 with a zero (stores 8 values; only 6 are counted).
+    uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}};
+    vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
+    utf32_output += 6; // We wrote 6 32-bit characters.
+    return 12;         // We consumed 12 bytes.
+  }
+  /// Either no fast path or an unimportant fast path.
+  // Table lookup: [0] picks the case/shuffle index, [1] the bytes consumed.
+  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][1];
+
+  if (idx < 64) {
+    // SIX (6) input code points
+    // Convert to UTF-16
+    uint16x8_t composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+    // Zero extend via ST2 with a zero (stores 8 values; only 6 are counted).
+    uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}};
+    vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
+    utf32_output += 6; // We wrote 6 32-bit characters.
+    return consumed;
+  } else if (idx < 145) {
+    // FOUR (4) input code points
+    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
+        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+    // Shuffle each code point into its own 32-bit lane
+    // 1 byte: 00000000 00000000 0ccccccc
+    // 2 byte: 00000000 110bbbbb 10cccccc
+    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
+    // Split
+    // 00000000 00000000 0ccccccc
+    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); // 6 or 7 bits
+    // Note: unmasked
+    // xxxxxxxx aaaaxxxx xxxxxxxx
+    uint32x4_t high = vshrq_n_u32(perm, 4); // 4 bits
+    // Use 16 bit bic instead of and.
+    // The top bits will be corrected later in the bsl
+    // 00000000 10bbbbbb 00000000
+    uint32x4_t middle = vreinterpretq_u32_u16(
+        vbicq_u16(vreinterpretq_u16_u32(perm),
+                  vmovq_n_u16(uint16_t(~0xff00)))); // 5 or 6 bits
+    // Combine low and middle with shift right accumulate
+    // 00000000 00xxbbbb bbcccccc
+    uint32x4_t lowmid = vsraq_n_u32(ascii, middle, 2);
+    // Insert top 4 bits from high byte with bitwise select
+    // 00000000 aaaabbbb bbcccccc
+    uint32x4_t composed = vbslq_u32(vmovq_n_u32(0x0000F000), high, lowmid);
+    vst1q_u32(utf32_output, composed);
+    utf32_output += 4; // We wrote 4 32-bit characters.
+    return consumed;
+  } else if (idx < 209) {
+    // THREE (3) input code points
+    if (input_utf8_end_of_code_point_mask == 0x888) {
+      // Mask 0x888: three 4-byte UTF-8 sequences, which become three 4-byte
+      // UTF-32 code units. This uses the same method as the fixed 3 byte
+      // version, reversing and shift left insert. However, there is no need for
+      // a shuffle mask now, just rev16 and rev32.
+      //
+      // This version does not use the LUT, but 4 byte sequences are less common
+      // and the overhead of the extra memory access is less important than the
+      // early branch overhead in shorter sequences, so it comes last.
+
+      // Swap pairs of bytes
+      // 10dddddd|10cccccc|10bbbbbb|11110aaa
+      // 10cccccc 10dddddd|11110aaa 10bbbbbb
+      uint16x8_t swap1 = vreinterpretq_u16_u8(vrev16q_u8(in));
+      // Shift left and insert
+      // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
+      uint16x8_t merge1 = vsliq_n_u16(swap1, vreinterpretq_u16_u8(in), 6);
+      // Swap 16-bit lanes
+      // xxxxcccc ccdddddd xxxxxxxa aabbbbbb
+      // xxxxxxxa aabbbbbb xxxxcccc ccdddddd
+      uint32x4_t swap2 = vreinterpretq_u32_u16(vrev32q_u16(merge1));
+      // Shift insert again
+      // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
+      uint32x4_t merge2 = vsliq_n_u32(swap2, vreinterpretq_u32_u16(merge1), 12);
+      // Clear the garbage
+      // 00000000 000aaabb bbbbcccc ccdddddd
+      uint32x4_t composed = vandq_u32(merge2, vmovq_n_u32(0x1FFFFF));
+      // Store (writes 4 lanes; only the first 3 are counted below)
+      vst1q_u32(utf32_output, composed);
+
+      utf32_output += 3; // We wrote 3 32-bit characters.
+      return 12;         // We consumed 12 bytes.
+    }
+    // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit
+    // due to surrogates no longer being involved.
+    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
+        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+    // 1 byte: 00000000 00000000 00000000 0ddddddd
+    // 2 byte: 00000000 00000000 110ccccc 10dddddd
+    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
+    // Ascii
+    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F));
+    uint32x4_t middle = vandq_u32(perm, vmovq_n_u32(0x3f00));
+    // When converting the way we do, the 3 byte prefix will be interpreted as
+    // the 18th bit being set, since the code would interpret the lead byte
+    // (0b1110bbbb) as a continuation byte (0b10bbbbbb). To fix this, we can
+    // either xor or do an 8 bit add of the 6th bit shifted right by 1. Since
+    // NEON has shift right accumulate, we use that.
+    //  4 byte   3 byte
+    // 10bbbbbb 1110bbbb
+    // 00000000 01000000 6th bit
+    // 00000000 00100000 shift right
+    // 10bbbbbb 0000bbbb add
+    // 00bbbbbb 0000bbbb mask
+    uint8x16_t correction =
+        vreinterpretq_u8_u32(vandq_u32(perm, vmovq_n_u32(0x00400000)));
+    uint32x4_t corrected = vreinterpretq_u32_u8(
+        vsraq_n_u8(vreinterpretq_u8_u32(perm), correction, 1));
+    // 00000000 00000000 0000cccc ccdddddd
+    uint32x4_t cd = vsraq_n_u32(ascii, middle, 2);
+    // Insert twice
+    // xxxxxxxx xxxaaabb bbbbxxxx xxxxxxxx
+    uint32x4_t ab = vbslq_u32(vmovq_n_u32(0x01C0000), vshrq_n_u32(corrected, 6),
+                              vshrq_n_u32(corrected, 4));
+    // 00000000 000aaabb bbbbcccc ccdddddd
+    uint32x4_t composed = vbslq_u32(vmovq_n_u32(0xFFE00FFF), cd, ab);
+    // Store (writes 4 lanes; only the first 3 are counted below)
+    vst1q_u32(utf32_output, composed);
+    utf32_output += 3; // We wrote 3 32-bit characters.
+    return consumed;
+  } else {
+    // idx >= 209: there is an error, but errors are not handled here.
+    return 12;
+  }
+}
+/* end file src/arm64/arm_convert_utf8_to_utf32.cpp */
+
+/* begin file src/arm64/arm_convert_utf16_to_latin1.cpp */
+// Narrows UTF-16 to Latin-1 eight code units at a time; the tail is left over.
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+arm_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+                            char *latin1_output) {
+  const char16_t *end = buf + len;
+  while (end - buf >= 8) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) { // swap the bytes of each code unit
+      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+    }
+    if (vmaxvq_u16(in) <= 0xff) { // all eight code units fit in one byte
+      // 1. pack the bytes (keep the low byte of each 16-bit lane)
+      uint8x8_t latin1_packed = vmovn_u16(in);
+      // 2. store (8 bytes)
+      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
+      // 3. adjust pointers
+      buf += 8;
+      latin1_output += 8;
+    } else { // a code unit exceeds 0xff: fail with a null input pointer
+      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
+    }
+  } // while: fewer than 8 code units remain
+  return std::make_pair(buf, latin1_output); // first unprocessed unit, out end
+}
+// Narrows UTF-16 to Latin-1 in blocks of 8; failure yields TOO_LARGE + index.
+template <endianness big_endian>
+std::pair<result, char *>
+arm_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
+                                        char *latin1_output) {
+  const char16_t *start = buf;
+  const char16_t *end = buf + len;
+  while (end - buf >= 8) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) { // swap the bytes of each code unit
+      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+    }
+    if (vmaxvq_u16(in) <= 0xff) { // all eight code units fit in one byte
+      // 1. pack the bytes (keep the low byte of each 16-bit lane)
+      uint8x8_t latin1_packed = vmovn_u16(in);
+      // 2. store (8 bytes)
+      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
+      // 3. adjust pointers
+      buf += 8;
+      latin1_output += 8;
+    } else {
+      // Scalar fallback: copy the valid prefix and locate the offending unit.
+      for (int k = 0; k < 8; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if (word <= 0xff) {
+          *latin1_output++ = char(word);
+        } else { // error position is counted in code units from start
+          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+                                latin1_output);
+        }
+      }
+    }
+  } // while: fewer than 8 code units remain
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        latin1_output);
+}
+/* end file src/arm64/arm_convert_utf16_to_latin1.cpp */
+/* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */
+/*
+    The vectorized algorithm works on a single 128-bit NEON register, i.e.,
+    it loads eight 16-bit code units.
+
+    We consider three cases:
+    1. an input register contains no surrogates and each value
+       is in range 0x0000 .. 0x07ff.
+    2. an input register contains no surrogates and values are
+       in range 0x0000 .. 0xffff.
+    3. an input register contains surrogates --- i.e. codepoints
+       can have 16 or 32 bits.
+
+    Ad 1.
+
+    When values are less than 0x0800, it means that a 16-bit code unit
+    can be converted into: 1) single UTF8 byte (when it is an ASCII
+    char) or 2) two UTF8 bytes.
+
+    For this case we do only some shuffle to obtain these 2-byte
+    codes and finally compress the whole SSE register with a single
+    shuffle.
+
+    We need 256-entry lookup table to get a compression pattern
+    and the number of output bytes in the compressed vector register.
+    Each entry occupies 17 bytes.
+
+    Ad 2.
+
+    When values fit in 16-bit code units, but are above 0x07ff, then
+    a single word may produce one, two or three UTF8 bytes.
+
+    We prepare data for all these three cases in two registers.
+    The first register contains lower two UTF8 bytes (used in all
+    cases), while the second one contains just the third byte for
+    the three-UTF8-bytes case.
+
+    Finally these two registers are interleaved forming eight-element
+    array of 32-bit values. The array spans two SSE registers.
+    The bytes from the registers are compressed using two shuffles.
+
+    We need 256-entry lookup table to get a compression pattern
+    and the number of output bytes in the compressed vector register.
+    Each entry occupies 17 bytes.
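+    NOTE: cases 1 and 2 and their tables describe the UTF-16 to UTF-8
+    kernel; the UTF-16 to UTF-32 routines below use no lookup table.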
+    To summarize:
+    - We need two 256-entry tables that have 8704 bytes in total.
+*/
+/*
+  Returns the first unprocessed code unit of buf (nullptr on a bad surrogate)
+  and the UTF-32 output end. A scalar routine should convert the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char32_t *>
+arm_convert_utf16_to_utf32(const char16_t *buf, size_t len,
+                           char32_t *utf32_out) {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+  const char16_t *end = buf + len;
+
+  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+
+  while (end - buf >= 8) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) { // swap the bytes of each code unit
+      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+    }
+
+    const uint16x8_t surrogates_bytemask =
+        vceqq_u16(vandq_u16(in, v_f800), v_d800);
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+    // However, it is likely an uncommon occurrence.
+    if (vmaxvq_u16(surrogates_bytemask) == 0) {
+      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
+      // units
+      vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
+      vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
+      utf32_output += 8;
+      buf += 8;
+      // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback over up to 15 code units. It may seem
+      // wasteful, but SIMD with surrogate pairs may need non-trivial tables.
+      // forward is clamped so that reading buf[k + 1] stays inside the input.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if ((word & 0xF800) != 0xD800) {
+          *utf32_output++ = char32_t(word);
+        } else {
+          // must be a surrogate pair: 0xD800..0xDBFF then 0xDC00..0xDFFF
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
+          k++; // the second code unit of the pair is consumed as well
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) { // either half is out of its range
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char32_t *>(utf32_output));
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
+        }
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(buf, reinterpret_cast<char32_t *>(utf32_output));
+}
+
+/*
+  Returns a pair: a result struct and the end of the UTF-32 output.
+  If there is an error, the count field of the result is the position of the
+  error. Otherwise, it is the position of the first unprocessed code unit in
+  buf (even if finished). A scalar routine should carry on the conversion of
+  the tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char32_t *>
+arm_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
+                                       char32_t *utf32_out) {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+  const char16_t *start = buf;
+  const char16_t *end = buf + len;
+
+  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+
+  while ((end - buf) >= 8) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) { // swap the bytes of each code unit
+      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+    }
+
+    const uint16x8_t surrogates_bytemask =
+        vceqq_u16(vandq_u16(in, v_f800), v_d800);
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+    // However, it is likely an uncommon occurrence.
+    if (vmaxvq_u16(surrogates_bytemask) == 0) {
+      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
+      // units
+      vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
+      vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
+      utf32_output += 8;
+      buf += 8;
+      // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback over up to 15 code units. It may seem
+      // wasteful, but SIMD with surrogate pairs may need non-trivial tables.
+      // forward is clamped so that reading buf[k + 1] stays inside the input.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if ((word & 0xF800) != 0xD800) {
+          *utf32_output++ = char32_t(word);
+        } else {
+          // must be a surrogate pair: 0xD800..0xDBFF then 0xDC00..0xDFFF
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
+          k++; // the second code unit of the pair is consumed as well
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) { // either half is out of its range
+            return std::make_pair( // k - 1 is the first unit of the bad pair
+                result(error_code::SURROGATE, buf - start + k - 1),
+                reinterpret_cast<char32_t *>(utf32_output));
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
+        }
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        reinterpret_cast<char32_t *>(utf32_output));
+}
+/* end file src/arm64/arm_convert_utf16_to_utf32.cpp */
+/* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */
+/*
+    The vectorized algorithm works on a single 128-bit NEON register, i.e.,
+    it loads eight 16-bit code units.
+
+    We consider three cases:
+    1. an input register contains no surrogates and each value
+       is in range 0x0000 .. 0x07ff.
+    2. an input register contains no surrogates and values are
+       in range 0x0000 .. 0xffff.
+    3. an input register contains surrogates --- i.e. codepoints
+       can have 16 or 32 bits.
+
+    Ad 1.
+
+    When values are less than 0x0800, it means that a 16-bit code unit
+    can be converted into: 1) single UTF8 byte (when it is an ASCII
+    char) or 2) two UTF8 bytes.
+
+    For this case we do only some shuffle to obtain these 2-byte
+    codes and finally compress the whole NEON register with a single
+    shuffle.
+
+    We need 256-entry lookup table to get a compression pattern
+    and the number of output bytes in the compressed vector register.
+    Each entry occupies 17 bytes.
+
+    Ad 2.
+
+    When values fit in 16-bit code units, but are above 0x07ff, then
+    a single word may produce one, two or three UTF8 bytes.
+
+    We prepare data for all these three cases in two registers.
+    The first register contains lower two UTF8 bytes (used in all
+    cases), while the second one contains just the third byte for
+    the three-UTF8-bytes case.
+
+    Finally these two registers are interleaved forming an eight-element
+    array of 32-bit values. The array spans two NEON registers.
+    The bytes from the registers are compressed using two shuffles.
+
+    We need 256-entry lookup table to get a compression pattern
+    and the number of output bytes in the compressed vector register.
+    Each entry occupies 17 bytes.
+
+
+    To summarize:
+    - We need two 256-entry tables that have 8704 bytes in total.
+*/
+/*
+  Returns a pair: the first unprocessed code unit from buf and utf8_output.
+  A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+arm_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char16_t *end = buf + len;
+
+  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) {
+      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+    }
+    if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
+      // It is common enough that we have sequences of 16 consecutive ASCII
+      // characters.
+      uint16x8_t nextin =
+          vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
+      if (!match_system(big_endian)) {
+        nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin)));
+      }
+      if (vmaxvq_u16(nextin) > 0x7F) {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        uint8x8_t utf8_packed = vmovn_u16(in);
+        // 2. store (8 bytes)
+        vst1_u8(utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        in = nextin;
+      } else {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
+        // 2. store (16 bytes)
+        vst1q_u8(utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+      }
+    }
+
+    if (vmaxvq_u16(in) <= 0x7FF) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+      const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const uint16x8_t t0 = vshlq_n_u16(in, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const uint16x8_t t2 = vandq_u16(in, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const uint16x8_t t3 = vorrq_u16(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+      // 2. merge ASCII and 2-byte codewords
+      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+      const uint8x16_t utf8_unpacked =
+          vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
+      // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint16x8_t mask = simdutf_make_uint16x8_t(
+          0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
+#else
+      const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
+                               0x0002, 0x0008, 0x0020, 0x0080};
+#endif
+      uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+      // 4. pack the bytes
+      const uint8_t *row =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+      const uint8x16_t shuffle = vld1q_u8(row + 1);
+      const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+      // 5. store bytes
+      vst1q_u8(utf8_output, utf8_packed);
+
+      // 6. adjust pointers
+      buf += 8;
+      utf8_output += row[0];
+      continue;
+    }
+    const uint16x8_t surrogates_bytemask =
+        vceqq_u16(vandq_u16(in, v_f800), v_d800);
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+    // However, it is likely an uncommon occurrence.
+    if (vmaxvq_u16(surrogates_bytemask) == 0) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint16x8_t dup_even = simdutf_make_uint16x8_t(
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+      const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
+                                   0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+#endif
+      /* In this branch we handle three cases:
+         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+        single UTF-8 byte
+         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
+        UTF-8 bytes
+         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+        three UTF-8 bytes
+
+        We expand the input word (16-bit) into two code units (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
+
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
+
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
+
+        Finally from these two code units we build proper UTF-8 sequence, taking
+        into account the case (i.e, the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const uint16x8_t t0 = vreinterpretq_u16_u8(
+          vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+      const uint16x8_t s0 = vshrq_n_u16(in, 12);
+      // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+      const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
+      // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+      const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+      // [00bb|bbbb|0000|aaaa]
+      const uint16x8_t s2 = vorrq_u16(s0, s1s);
+      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+      const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+      const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
+      const uint16x8_t m0 =
+          vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+      const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+      // 4. expand code units 16-bit => 32-bit
+      const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+      const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint16x8_t onemask = simdutf_make_uint16x8_t(
+          0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
+      const uint16x8_t twomask = simdutf_make_uint16x8_t(
+          0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
+#else
+      const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
+                                  0x0100, 0x0400, 0x1000, 0x4000};
+      const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
+                                  0x0200, 0x0800, 0x2000, 0x8000};
+#endif
+      const uint16x8_t combined =
+          vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
+                    vandq_u16(one_or_two_bytes_bytemask, twomask));
+      const uint16_t mask = vaddvq_u16(combined);
+      // The following fast path may or may not be beneficial.
+      /*if(mask == 0) {
+        // We only have three-byte code units. Use fast path.
+        const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+        vst1q_u8(utf8_output, utf8_0);
+        utf8_output += 12;
+        vst1q_u8(utf8_output, utf8_1);
+        utf8_output += 12;
+        buf += 8;
+        continue;
+      }*/
+      const uint8_t mask0 = uint8_t(mask);
+
+      const uint8_t *row0 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+      const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+      const uint8_t *row1 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+      const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+      vst1q_u8(utf8_output, utf8_0);
+      utf8_output += row0[0];
+      vst1q_u8(utf8_output, utf8_1);
+      utf8_output += row1[0];
+
+      buf += 8;
+      // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if ((word & 0xFF80) == 0) {
+          *utf8_output++ = char(word);
+        } else if ((word & 0xF800) == 0) {
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xF800) != 0xD800) {
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) {
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char *>(utf8_output));
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value >> 18) | 0b11110000);
+          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+
+  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
+}
+
+/*
+  Returns a pair: a result struct and utf8_output.
+  If there is an error, the count field of the result is the position of the
+  error. Otherwise, it is the position of the first unprocessed code unit in
+  buf (even if finished). A scalar routine should carry on the conversion of
+  the tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char *>
+arm_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
+                                      char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char16_t *start = buf;
+  const char16_t *end = buf + len;
+
+  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
+  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); // 110xxxxx / 10xxxxxx marker bits
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
+    if (!match_system(big_endian)) {
+      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+    }
+    if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
+      // It is common enough that we have sequences of 16 consecutive ASCII
+      // characters.
+      uint16x8_t nextin =
+          vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
+      if (!match_system(big_endian)) {
+        nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin)));
+      }
+      if (vmaxvq_u16(nextin) > 0x7F) {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        uint8x8_t utf8_packed = vmovn_u16(in);
+        // 2. store (8 bytes)
+        vst1_u8(utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        in = nextin; // the non-ASCII second half is handled by the code below
+      } else {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
+        // 2. store (16 bytes)
+        vst1q_u8(utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+      }
+    }
+
+    if (vmaxvq_u16(in) <= 0x7FF) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); // NOTE(review): int16_t cast on a u16 splat; value fits
+      const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const uint16x8_t t0 = vshlq_n_u16(in, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const uint16x8_t t2 = vandq_u16(in, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const uint16x8_t t3 = vorrq_u16(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+      // 2. merge ASCII and 2-byte codewords
+      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+      const uint8x16_t utf8_unpacked =
+          vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
+      // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint16x8_t mask = simdutf_make_uint16x8_t(
+          0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
+#else
+      const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
+                               0x0002, 0x0008, 0x0020, 0x0080};
+#endif
+      uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+      // 4. pack the bytes
+      const uint8_t *row =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+      const uint8x16_t shuffle = vld1q_u8(row + 1);
+      const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+      // 5. store bytes
+      vst1q_u8(utf8_output, utf8_packed);
+
+      // 6. adjust pointers
+      buf += 8;
+      utf8_output += row[0]; // row[0] holds the output length, row[1..] the shuffle
+      continue;
+    }
+    const uint16x8_t surrogates_bytemask =
+        vceqq_u16(vandq_u16(in, v_f800), v_d800);
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+    // However, it is likely an uncommon occurrence.
+    if (vmaxvq_u16(surrogates_bytemask) == 0) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint16x8_t dup_even = simdutf_make_uint16x8_t(
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+      const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
+                                   0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+#endif
+      /* In this branch we handle three cases:
+         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+        single UTF-8 byte
+         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
+        UTF-8 bytes
+         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+        three UTF-8 bytes
+
+        We expand the input word (16-bit) into two code units (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
+
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
+
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
+
+        Finally from these two code units we build proper UTF-8 sequence, taking
+        into account the case (i.e, the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const uint16x8_t t0 = vreinterpretq_u16_u8(
+          vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+      const uint16x8_t s0 = vshrq_n_u16(in, 12);
+      // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+      const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
+      // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+      const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+      // [00bb|bbbb|0000|aaaa]
+      const uint16x8_t s2 = vorrq_u16(s0, s1s);
+      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+      const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+      const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
+      const uint16x8_t m0 =
+          vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+      const uint16x8_t s4 = veorq_u16(s3, m0); // flips bit 14 only in 3-byte lanes
+#undef simdutf_vec
+
+      // 4. expand code units 16-bit => 32-bit
+      const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+      const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint16x8_t onemask = simdutf_make_uint16x8_t(
+          0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
+      const uint16x8_t twomask = simdutf_make_uint16x8_t(
+          0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
+#else
+      const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
+                                  0x0100, 0x0400, 0x1000, 0x4000};
+      const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
+                                  0x0200, 0x0800, 0x2000, 0x8000};
+#endif
+      const uint16x8_t combined =
+          vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
+                    vandq_u16(one_or_two_bytes_bytemask, twomask));
+      const uint16_t mask = vaddvq_u16(combined); // two bits per input code unit
+      // The following fast path may or may not be beneficial.
+      /*if(mask == 0) {
+        // We only have three-byte code units. Use fast path.
+        const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+        vst1q_u8(utf8_output, utf8_0);
+        utf8_output += 12;
+        vst1q_u8(utf8_output, utf8_1);
+        utf8_output += 12;
+        buf += 8;
+        continue;
+      }*/
+      const uint8_t mask0 = uint8_t(mask); // low byte: first four code units
+
+      const uint8_t *row0 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+      const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8); // last four code units
+      const uint8_t *row1 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+      const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+      vst1q_u8(utf8_output, utf8_0);
+      utf8_output += row0[0];
+      vst1q_u8(utf8_output, utf8_1);
+      utf8_output += row1[0];
+
+      buf += 8;
+      // otherwise: surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) { // clamp when fewer than 16 units remain
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if ((word & 0xFF80) == 0) {
+          *utf8_output++ = char(word);
+        } else if ((word & 0xF800) == 0) {
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xF800) != 0xD800) {
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          // must be a surrogate pair (validated just below)
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
+          k++; // the second code unit of the pair is consumed as well
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) {
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k - 1), // index of the first unit of the pair
+                reinterpret_cast<char *>(utf8_output));
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value >> 18) | 0b11110000);
+          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k; // advance past every code unit the scalar loop consumed
+    }
+  } // while
+
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        reinterpret_cast<char *>(utf8_output));
+}
+/* end file src/arm64/arm_convert_utf16_to_utf8.cpp */
+
+/* begin file src/arm64/arm_base64.cpp */
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
+
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+                     base64_options options) {
+  // credit: Wojciech Muła
+  uint8_t *out = (uint8_t *)dst;
+  constexpr static uint8_t source_table[64] = { // alphabet stored 4-way interleaved: every 4th entry reads A, B, C, ...
+      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
+      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
+      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
+      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
+      'N', 'd', 't', '9', 'O', 'e', 'u', '+', 'P', 'f', 'v', '/',
+  };
+  constexpr static uint8_t source_table_url[64] = { // same layout, with '-' and '_' replacing '+' and '/'
+      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
+      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
+      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
+      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
+      'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_',
+  };
+  const uint8x16_t v3f = vdupq_n_u8(0x3f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  // When trying to load a uint8_t array, Visual Studio might
+  // error with: error C2664: '__n128x4 neon_ld4m_q8(const char *)':
+  // cannot convert argument 1 from 'const uint8_t [64]' to 'const char *'
+  const uint8x16x4_t table = vld4q_u8(
+      (reinterpret_cast<const char *>(options & base64_url) ? source_table_url
+                                                            : source_table)); // NOTE(review): the cast wraps the condition, not the table pointer -- confirm intent
+#else
+  const uint8x16x4_t table =
+      vld4q_u8((options & base64_url) ? source_table_url : source_table);
+#endif
+  size_t i = 0;
+  for (; i + 16 * 3 <= srclen; i += 16 * 3) { // 48 input bytes -> 64 output bytes per iteration
+    const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i);
+    uint8x16x4_t result;
+    result.val[0] = vshrq_n_u8(in.val[0], 2);
+    result.val[1] =
+        vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), v3f);
+    result.val[2] =
+        vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), v3f);
+    result.val[3] = vandq_u8(in.val[2], v3f);
+    result.val[0] = vqtbl4q_u8(table, result.val[0]); // 6-bit index -> alphabet character
+    result.val[1] = vqtbl4q_u8(table, result.val[1]);
+    result.val[2] = vqtbl4q_u8(table, result.val[2]);
+    result.val[3] = vqtbl4q_u8(table, result.val[3]);
+    vst4q_u8(out, result);
+    out += 64;
+  }
+  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i,
+                                            options); // remaining srclen - i bytes go through the scalar helper
+
+  return size_t((char *)out - dst); // number of bytes written to dst
+}
+
+static inline void compress(uint8x16_t data, uint16_t mask, char *output) {
+  if (mask == 0) { // no bit set: store the vector unchanged
+    vst1q_u8((uint8_t *)output, data);
+    return;
+  }
+  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
+  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
+  uint64x2_t compactmasku64 = {tables::base64::thintable_epi8[mask1],
+                               tables::base64::thintable_epi8[mask2]};
+  uint8x16_t compactmask = vreinterpretq_u8_u64(compactmasku64);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  const uint8x16_t off =
+      simdutf_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8);
+#else
+  const uint8x16_t off = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
+#endif
+
+  compactmask = vaddq_u8(compactmask, off); // rebase the upper-half indices by 8
+  uint8x16_t pruned = vqtbl1q_u8(data, compactmask);
+
+  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
+  // then load the corresponding mask, what it does is to write
+  // only the first pop1 bytes from the first 8 bytes, and then
+  // it fills in with the bytes from the second 8 bytes + some filling
+  // at the end.
+  compactmask = vld1q_u8(tables::base64::pshufb_combine_table + pop1 * 8);
+  uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
+  vst1q_u8((uint8_t *)output, answer); // a full 16 bytes are always written
+}
+
+struct block64 {
+  uint8x16_t chunks[4]; // four 16-byte NEON registers, 64 bytes in total
+};
+
+static_assert(sizeof(block64) == 64, "block64 is not 64 bytes");
+template <bool base64_url> uint64_t to_base64_mask(block64 *b, bool *error) {
+  uint8x16_t v0f = vdupq_n_u8(0xf);
+
+  uint8x16_t underscore0, underscore1, underscore2, underscore3;
+  if (base64_url) { // 0x5f is '_'
+    underscore0 = vceqq_u8(b->chunks[0], vdupq_n_u8(0x5f));
+    underscore1 = vceqq_u8(b->chunks[1], vdupq_n_u8(0x5f));
+    underscore2 = vceqq_u8(b->chunks[2], vdupq_n_u8(0x5f));
+    underscore3 = vceqq_u8(b->chunks[3], vdupq_n_u8(0x5f));
+  } else { // not used by this instantiation; presumably silences unused warnings
+    (void)underscore0;
+    (void)underscore1;
+    (void)underscore2;
+    (void)underscore3;
+  }
+
+  uint8x16_t lo_nibbles0 = vandq_u8(b->chunks[0], v0f);
+  uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f);
+  uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f);
+  uint8x16_t lo_nibbles3 = vandq_u8(b->chunks[3], v0f);
+
+  // Needed by the decoding step.
+  uint8x16_t hi_nibbles0 = vshrq_n_u8(b->chunks[0], 4);
+  uint8x16_t hi_nibbles1 = vshrq_n_u8(b->chunks[1], 4);
+  uint8x16_t hi_nibbles2 = vshrq_n_u8(b->chunks[2], 4);
+  uint8x16_t hi_nibbles3 = vshrq_n_u8(b->chunks[3], 4);
+  uint8x16_t lut_lo;
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  if (base64_url) {
+    lut_lo =
+        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                                0x70, 0x61, 0xe1, 0xf4, 0xe5, 0xa5, 0xf4, 0xf4);
+  } else {
+    lut_lo =
+        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                                0x70, 0x61, 0xe1, 0xb4, 0xe5, 0xe5, 0xf4, 0xb4);
+  }
+#else
+  if (base64_url) {
+    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                        0x70, 0x61, 0xe1, 0xf4, 0xe5, 0xa5, 0xf4, 0xf4};
+  } else {
+    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
+                        0x70, 0x61, 0xe1, 0xb4, 0xe5, 0xe5, 0xf4, 0xb4};
+  }
+#endif
+  uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0);
+  uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1);
+  uint8x16_t lo2 = vqtbl1q_u8(lut_lo, lo_nibbles2);
+  uint8x16_t lo3 = vqtbl1q_u8(lut_lo, lo_nibbles3);
+  uint8x16_t lut_hi;
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  if (base64_url) { // NOTE(review): both branches build the same lut_hi
+    lut_hi =
+        simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
+                                0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+  } else {
+    lut_hi =
+        simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
+                                0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+  }
+#else
+  if (base64_url) { // NOTE(review): both branches build the same lut_hi
+    lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
+                        0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+  } else {
+    lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
+                        0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+  }
+#endif
+  uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_nibbles0);
+  uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_nibbles1);
+  uint8x16_t hi2 = vqtbl1q_u8(lut_hi, hi_nibbles2);
+  uint8x16_t hi3 = vqtbl1q_u8(lut_hi, hi_nibbles3);
+
+  if (base64_url) { // clear the hi lookup wherever the input byte is '_'
+    hi0 = vbicq_u8(hi0, underscore0);
+    hi1 = vbicq_u8(hi1, underscore1);
+    hi2 = vbicq_u8(hi2, underscore2);
+    hi3 = vbicq_u8(hi3, underscore3);
+  }
+
+  uint8_t checks = // largest byte of the OR of the four (lo & hi) vectors
+      vmaxvq_u8(vorrq_u8(vorrq_u8(vandq_u8(lo0, hi0), vandq_u8(lo1, hi1)),
+                         vorrq_u8(vandq_u8(lo2, hi2), vandq_u8(lo3, hi3))));
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  const uint8x16_t bit_mask =
+      simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                              0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+#else
+  const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#endif
+  uint64_t badcharmask = 0;
+  *error = checks > 0x3; // error iff that maximum exceeds 0x3
+  if (checks) { // some byte has lo & hi != 0: build one mask bit per input byte
+    // Add each of the elements next to each other, successively, to stuff each
+    // 8 byte mask into one.
+    uint8x16_t test0 = vtstq_u8(lo0, hi0);
+    uint8x16_t test1 = vtstq_u8(lo1, hi1);
+    uint8x16_t test2 = vtstq_u8(lo2, hi2);
+    uint8x16_t test3 = vtstq_u8(lo3, hi3);
+    uint8x16_t sum0 =
+        vpaddq_u8(vandq_u8(test0, bit_mask), vandq_u8(test1, bit_mask));
+    uint8x16_t sum1 =
+        vpaddq_u8(vandq_u8(test2, bit_mask), vandq_u8(test3, bit_mask));
+    sum0 = vpaddq_u8(sum0, sum1);
+    sum0 = vpaddq_u8(sum0, sum0);
+    badcharmask = vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+  }
+  // This is the transformation step that can be done while we are waiting for
+  // sum0
+  uint8x16_t roll_lut;
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+  if (base64_url) {
+    roll_lut =
+        simdutf_make_uint8x16_t(0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  } else {
+    roll_lut =
+        simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  }
+#else
+  if (base64_url) {
+    roll_lut = uint8x16_t{0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                          0x0,  0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+  } else {
+    roll_lut = uint8x16_t{0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
+                          0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+  }
+#endif
+  uint8x16_t vsecond_last = base64_url ? vdupq_n_u8(0x2d) : vdupq_n_u8(0x2f); // '-' : '/'
+  if (base64_url) { // '_' bytes get hi nibble 0 for the roll lookup
+    hi_nibbles0 = vbicq_u8(hi_nibbles0, underscore0);
+    hi_nibbles1 = vbicq_u8(hi_nibbles1, underscore1);
+    hi_nibbles2 = vbicq_u8(hi_nibbles2, underscore2);
+    hi_nibbles3 = vbicq_u8(hi_nibbles3, underscore3);
+  }
+  uint8x16_t roll0 = vqtbl1q_u8( // index = hi nibble, minus 1 where the byte equals vsecond_last
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], vsecond_last), hi_nibbles0));
+  uint8x16_t roll1 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], vsecond_last), hi_nibbles1));
+  uint8x16_t roll2 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], vsecond_last), hi_nibbles2));
+  uint8x16_t roll3 = vqtbl1q_u8(
+      roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], vsecond_last), hi_nibbles3));
+  b->chunks[0] = vaddq_u8(b->chunks[0], roll0); // the block is rewritten in place
+  b->chunks[1] = vaddq_u8(b->chunks[1], roll1);
+  b->chunks[2] = vaddq_u8(b->chunks[2], roll2);
+  b->chunks[3] = vaddq_u8(b->chunks[3], roll3);
+  return badcharmask; // 0 unless some byte had lo & hi != 0
+}
+
+void copy_block(block64 *b, char *output) { // stores all 64 bytes of *b at output
+  vst1q_u8((uint8_t *)output, b->chunks[0]);
+  vst1q_u8((uint8_t *)output + 16, b->chunks[1]);
+  vst1q_u8((uint8_t *)output + 32, b->chunks[2]);
+  vst1q_u8((uint8_t *)output + 48, b->chunks[3]);
+}
+
+uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
+  uint64_t popcounts = // byte k = number of zero bits in byte k of mask
+      vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
+  uint64_t offsets = popcounts * 0x0101010101010101; // byte k = sum of counts 0..k (prefix sums)
+  compress(b->chunks[0], uint16_t(mask), output);
+  compress(b->chunks[1], uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF]); // offset: zero bits in mask bits 0..15
+  compress(b->chunks[2], uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF]); // zero bits in mask bits 0..31
+  compress(b->chunks[3], uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF]); // zero bits in mask bits 0..47
+  return offsets >> 56; // top byte: total number of zero bits in mask
+}
+
+// The caller of this function is responsible for ensuring that 64 bytes are
+// available for reading at src. The data is read into a block64 structure.
+void load_block(block64 *b, const char *src) {
+  b->chunks[0] = vld1q_u8(reinterpret_cast<const uint8_t *>(src));
+  b->chunks[1] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 16);
+  b->chunks[2] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 32);
+  b->chunks[3] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 48);
+}
+
+// The caller of this function is responsible for ensuring that 32 bytes are
+// available for reading at data. It returns a 16-byte value, narrowing the
+// sixteen 16-bit words to bytes with saturation.
+inline uint8x16_t load_satured(const uint16_t *data) {
+  uint16x8_t in1 = vld1q_u16(data);
+  uint16x8_t in2 = vld1q_u16(data + 8);
+  return vqmovn_high_u16(vqmovn_u16(in1), in2); // words above 0xFF become 0xFF
+}
+
+// Fills *b from 64 UTF-16 code units at src (128 readable bytes are the
+// caller's responsibility); each chunk gets 16 units narrowed by load_satured.
+void load_block(block64 *b, const char16_t *src) {
+  const uint16_t *const in = reinterpret_cast<const uint16_t *>(src);
+  for (int lane = 0; lane < 4; lane++) {
+    b->chunks[lane] = load_satured(in + 16 * lane);
+  }
+}
+
+// Packs 64 six-bit values read at src into 48 bytes written at out.
+void base64_decode_block(char *out, const char *src) {
+  // De-interleaving load: in.val[j] gathers the j-th value of every quad.
+  const uint8x16x4_t in = vld4q_u8(reinterpret_cast<const uint8_t *>(src));
+  uint8x16x3_t packed;
+  packed.val[0] = vorrq_u8(vshlq_n_u8(in.val[0], 2), vshrq_n_u8(in.val[1], 4));
+  packed.val[1] = vorrq_u8(vshlq_n_u8(in.val[1], 4), vshrq_n_u8(in.val[2], 2));
+  packed.val[2] = vorrq_u8(vshlq_n_u8(in.val[2], 6), in.val[3]);
+  // Interleaving store: the three byte planes are written as 16 triples.
+  vst3q_u8(reinterpret_cast<uint8_t *>(out), packed);
+}
+
+template <bool base64_url, typename char_type> // base64_url selects the table
+full_result
+compress_decode_base64(char *dst, const char_type *src, size_t srclen,
+                       base64_options options,
+                       last_chunk_handling_options last_chunk_options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
+  size_t equallocation =
+      srclen; // location of the first padding character if any
+  // skip trailing characters that the table marks as ignorable (value 64)
+  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+         to_base64[uint8_t(src[srclen - 1])] == 64) {
+    srclen--;
+  }
+  size_t equalsigns = 0; // number of trailing '=' stripped: 0, 1 or 2
+  if (srclen > 0 && src[srclen - 1] == '=') {
+    equallocation = srclen - 1;
+    srclen--;
+    equalsigns = 1;
+    // skip ignorable characters located between the two padding characters
+    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+           to_base64[uint8_t(src[srclen - 1])] == 64) {
+      srclen--;
+    }
+    if (srclen > 0 && src[srclen - 1] == '=') {
+      equallocation = srclen - 1;
+      srclen--;
+      equalsigns = 2;
+    }
+  }
+  if (srclen == 0) { // nothing is left once padding/ignorables are removed
+    if (equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation, 0}; // padding only
+    }
+    return {SUCCESS, 0, 0};
+  }
+  const char_type *const srcinit = src;
+  const char *const dstinit = dst;
+  const char_type *const srcend = src + srclen;
+
+  constexpr size_t block_size = 10; // staging capacity, in 64-byte blocks
+  char buffer[block_size * 64];
+  char *bufferptr = buffer; // next free position in the staging buffer
+  if (srclen >= 64) {
+    const char_type *const srcend64 = src + srclen - 64;
+    while (src <= srcend64) { // one full 64-character block per iteration
+      block64 b;
+      load_block(&b, src);
+      src += 64;
+      bool error = false;
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
+      if (badcharmask) {
+        if (error) {
+          src -= 64; // rescan the block to locate the offending character
+          while (src < srcend && scalar::base64::is_eight_byte(*src) &&
+                 to_base64[uint8_t(*src)] <= 64) {
+            src++;
+          }
+          if (src < srcend) {
+            // should never happen
+          }
+          return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+                  size_t(dst - dstinit)};
+        }
+      }
+
+      if (badcharmask != 0) {
+        // optimization opportunity: check for simple masks like those made of
+        // continuous 1s followed by continuous 0s. And masks containing a
+        // single bad character.
+        bufferptr += compress_block(&b, badcharmask, bufferptr);
+      } else {
+        // optimization opportunity: if bufferptr == buffer and mask == 0, we
+        // can avoid the call to compress_block and decode directly.
+        copy_block(&b, bufferptr);
+        bufferptr += 64;
+      }
+      if (bufferptr >= (block_size - 1) * 64 + buffer) { // 9 blocks are full
+        for (size_t i = 0; i < (block_size - 1); i++) {
+          base64_decode_block(dst, buffer + i * 64);
+          dst += 48;
+        }
+        std::memcpy(buffer, buffer + (block_size - 1) * 64,
+                    64); // 64 might be too much
+        bufferptr -= (block_size - 1) * 64;
+      }
+    }
+  }
+  char *buffer_start = buffer;
+  // Optimization note: if this is almost full, then it is worth our
+  // time, otherwise, we should just decode directly.
+  int last_block = (int)((bufferptr - buffer_start) % 64);
+  if (last_block != 0 && srcend - src + last_block >= 64) {
+    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
+      uint8_t val = to_base64[uint8_t(*src)];
+      *bufferptr = char(val);
+      if (!scalar::base64::is_eight_byte(*src) || val > 64) {
+        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+                size_t(dst - dstinit)};
+      }
+      bufferptr += (val <= 63); // an ignorable (64) slot gets overwritten
+      src++;
+    }
+  }
+
+  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
+    base64_decode_block(dst, buffer_start);
+    dst += 48;
+  }
+  if ((bufferptr - buffer_start) % 64 != 0) {
+    while (buffer_start + 4 < bufferptr) { // quads followed by more values
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 4); // 4th byte is overwritten by a later write
+
+      dst += 3;
+      buffer_start += 4;
+    }
+    if (buffer_start + 4 <= bufferptr) { // last complete quad
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 3); // exactly 3 bytes: nothing may follow
+
+      dst += 3;
+      buffer_start += 4;
+    }
+    // we may have 1, 2 or 3 staged values left: move src back over the
+    // characters that produced them (and over ignorable ones in between)
+    int leftover = int(bufferptr - buffer_start);
+    while (leftover > 0) {
+      while (to_base64[uint8_t(*(src - 1))] == 64) {
+        src--;
+      }
+      src--;
+      leftover--;
+    }
+  }
+  if (src < srcend + equalsigns) { // remaining input goes to the scalar tail
+    full_result r = scalar::base64::base64_tail_decode(
+        dst, src, srcend - src, equalsigns, options, last_chunk_options);
+    r.input_count += size_t(src - srcinit); // make it relative to srcinit
+    if (r.error == error_code::INVALID_BASE64_CHARACTER ||
+        r.error == error_code::BASE64_EXTRA_BITS) {
+      return r;
+    } else {
+      r.output_count += size_t(dst - dstinit); // add what was written above
+    }
+    if (last_chunk_options != stop_before_partial &&
+        r.error == error_code::SUCCESS && equalsigns > 0) {
+      // additional checks: the padding must match the output length modulo 3
+      if ((r.output_count % 3 == 0) ||
+          ((r.output_count % 3) + 1 + equalsigns != 4)) {
+        r.error = error_code::INVALID_BASE64_CHARACTER;
+        r.input_count = equallocation;
+      }
+    }
+    return r;
+  }
+  if (equalsigns > 0) { // same padding consistency check, no tail to decode
+    if ((size_t(dst - dstinit) % 3 == 0) ||
+        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
+      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
+    }
+  }
+  return {SUCCESS, srclen, size_t(dst - dstinit)};
+}
+/* end file src/arm64/arm_base64.cpp */
+/* begin file src/arm64/arm_convert_utf32_to_latin1.cpp */
+// Converts UTF-32 to Latin-1 eight code points per iteration. Returns the
+// first unprocessed input position (fewer than 8 code points are left
+// unprocessed) and the output cursor, or a null input pointer as soon as a
+// group of eight contains a value above 0xFF.
+std::pair<const char32_t *, char *>
+arm_convert_utf32_to_latin1(const char32_t *buf, size_t len,
+                            char *latin1_output) {
+  const char32_t *const stop = buf + len;
+  for (; stop - buf >= 8; buf += 8, latin1_output += 8) {
+    const uint32_t *const words = reinterpret_cast<const uint32_t *>(buf);
+    // Saturating narrow: anything above 0xFFFF stays visible as 0xFFFF.
+    const uint16x4_t low = vqmovn_u32(vld1q_u32(words));
+    const uint16x4_t high = vqmovn_u32(vld1q_u32(words + 4));
+    const uint16x8_t narrowed = vcombine_u16(low, high);
+    if (vmaxvq_u16(narrowed) > 0xff) {
+      // Not representable in Latin-1: nothing is written for this group.
+      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
+    }
+    // All eight values fit in a byte: truncate and store them.
+    vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), vmovn_u16(narrowed));
+  }
+  return std::make_pair(buf, latin1_output);
+}
+
+std::pair<result, char *>
+arm_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+                                        char *latin1_output) {
+  const char32_t *start = buf;
+  const char32_t *end = buf + len;
+
+  while (end - buf >= 8) {
+    uint32x4_t in1 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+    uint32x4_t in2 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));
+
+    uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2));
+
+    if (vmaxvq_u16(utf16_packed) <= 0xff) {
+      // 1. pack the bytes
+      uint8x8_t latin1_packed = vmovn_u16(utf16_packed);
+      // 2. store (8 bytes)
+      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
+      // 3. adjust pointers
+      buf += 8;
+      latin1_output += 8;
+    } else {
+      // Let us do a scalar fallback.
+      for (int k = 0; k < 8; k++) {
+        uint32_t word = buf[k];
+        if (word <= 0xff) {
+          *latin1_output++ = char(word);
+        } else {
+          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+                                latin1_output);
+        }
+      }
+    }
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        latin1_output);
+}
+/* end file src/arm64/arm_convert_utf32_to_latin1.cpp */
+/* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */
+// Converts UTF-32 to UTF-16 in the byte order selected by big_endian, four
+// code points per iteration; fewer than four trailing code points are left
+// unprocessed. Returns {first unprocessed input, output cursor}. The input
+// pointer is null when a surrogate code point or a value above 0x10FFFF was
+// found; surrogates met on the vector path are only reported after the loop,
+// so output may already have been written past them.
+template <endianness big_endian>
+std::pair<const char32_t *, char16_t *>
+arm_convert_utf32_to_utf16(const char32_t *buf, size_t len,
+                           char16_t *utf16_out) {
+  uint16_t *out = reinterpret_cast<uint16_t *>(utf16_out);
+  const char32_t *const stop = buf + len;
+  const bool swap_bytes = !match_system(big_endian);
+  const uint16x4_t first_surrogate = vmov_n_u16((uint16_t)0xd800);
+  const uint16x4_t last_surrogate = vmov_n_u16((uint16_t)0xdfff);
+  // Accumulates, lane by lane, whether a surrogate was ever seen on the
+  // vector path; it is inspected once, after the loop.
+  uint16x4_t surrogate_seen = vmov_n_u16(0x0);
+
+  while (stop - buf >= 4) {
+    const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+
+    if (vmaxvq_u32(in) <= 0xFFFF) {
+      // All four code points fit in a single 16-bit unit: plain truncation.
+      uint16x4_t units = vmovn_u32(in);
+      const uint16x4_t is_surrogate = vand_u16(
+          vcge_u16(units, first_surrogate), vcle_u16(units, last_surrogate));
+      surrogate_seen = vorr_u16(surrogate_seen, is_surrogate);
+      // Swap the two bytes of each unit when the target order is not native.
+      if (swap_bytes) {
+        units = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(units)));
+      }
+      vst1_u16(out, units);
+      out += 4;
+      buf += 4;
+      continue;
+    }
+
+    // Some value is above 0xFFFF. The loop guard guarantees four readable
+    // code points, so exactly three are converted in scalar code before the
+    // vector path is retried; no clamping of that count is needed.
+    const size_t scalar_count = 3;
+    for (size_t k = 0; k < scalar_count; k++) {
+      uint32_t word = buf[k];
+      const bool surrogate = word >= 0xD800 && word <= 0xDFFF;
+      if (surrogate || word > 0x10FFFF) {
+        // Invalid code point: report failure, keep what was written so far.
+        return std::make_pair(nullptr, reinterpret_cast<char16_t *>(out));
+      }
+      if ((word & 0xFFFF0000) == 0) {
+        // will not generate a surrogate pair
+        *out++ = swap_bytes ? char16_t(word >> 8 | word << 8) : char16_t(word);
+        continue;
+      }
+      // will generate a surrogate pair
+      word -= 0x10000;
+      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+      if (swap_bytes) {
+        high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+        low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+      }
+      *out++ = high_surrogate;
+      *out++ = low_surrogate;
+    }
+    buf += scalar_count;
+  }
+
+  // check for invalid input (surrogates met on the vector path)
+  if (vmaxv_u16(surrogate_seen) != 0) {
+    return std::make_pair(nullptr, reinterpret_cast<char16_t *>(out));
+  }
+  // Success: hand back both cursors.
+  return std::make_pair(buf, reinterpret_cast<char16_t *>(out));
+}
+
+// Converts UTF-32 to UTF-16 (byte order selected by big_endian) and reports
+// the first error. Returns {result, output cursor}: result(SUCCESS, n) with n
+// the number of code points consumed (fewer than four may remain), or
+// result(SURROGATE / TOO_LARGE, position). On the vector path the position
+// is the start of the offending group of four, not the exact code point.
+template <endianness big_endian>
+std::pair<result, char16_t *>
+arm_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
+                                       char16_t *utf16_out) {
+  uint16_t *out = reinterpret_cast<uint16_t *>(utf16_out);
+  const char32_t *const start = buf;
+  const char32_t *const stop = buf + len;
+  const bool swap_bytes = !match_system(big_endian);
+  const uint16x4_t first_surrogate = vmov_n_u16((uint16_t)0xd800);
+  const uint16x4_t last_surrogate = vmov_n_u16((uint16_t)0xdfff);
+
+  while (stop - buf >= 4) {
+    const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+
+    if (vmaxvq_u32(in) <= 0xFFFF) {
+      // All four code points fit in a single 16-bit unit: plain truncation.
+      uint16x4_t units = vmovn_u32(in);
+      const uint16x4_t is_surrogate = vand_u16(
+          vcge_u16(units, first_surrogate), vcle_u16(units, last_surrogate));
+      if (vmaxv_u16(is_surrogate) != 0) {
+        // Nothing from this group is written.
+        return std::make_pair(result(error_code::SURROGATE, buf - start),
+                              reinterpret_cast<char16_t *>(out));
+      }
+      // Swap the two bytes of each unit when the target order is not native.
+      if (swap_bytes) {
+        units = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(units)));
+      }
+      vst1_u16(out, units);
+      out += 4;
+      buf += 4;
+      continue;
+    }
+
+    // Some value is above 0xFFFF. The loop guard guarantees four readable
+    // code points, so exactly three are converted in scalar code before the
+    // vector path is retried; no clamping of that count is needed.
+    const size_t scalar_count = 3;
+    for (size_t k = 0; k < scalar_count; k++) {
+      uint32_t word = buf[k];
+      if ((word & 0xFFFF0000) == 0) {
+        if (word >= 0xD800 && word <= 0xDFFF) {
+          return std::make_pair(result(error_code::SURROGATE, buf - start + k),
+                                reinterpret_cast<char16_t *>(out));
+        }
+        // will not generate a surrogate pair
+        *out++ = swap_bytes ? char16_t(word >> 8 | word << 8) : char16_t(word);
+        continue;
+      }
+      if (word > 0x10FFFF) {
+        return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+                              reinterpret_cast<char16_t *>(out));
+      }
+      // will generate a surrogate pair
+      word -= 0x10000;
+      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+      if (swap_bytes) {
+        high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+        low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+      }
+      *out++ = high_surrogate;
+      *out++ = low_surrogate;
+    }
+    buf += scalar_count;
+  }
+
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        reinterpret_cast<char16_t *>(out));
+}
+/* end file src/arm64/arm_convert_utf32_to_utf16.cpp */
+/* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */
+// Converts UTF-32 to UTF-8 while more than 28 code points remain; the rest is
+// left unprocessed. Returns {first unprocessed input, output cursor}; the
+// input pointer is null on a surrogate or a value above 0x10FFFF.
+std::pair<const char32_t *, char *>
+arm_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char32_t *end = buf + len;
+
+  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+
+  uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
+  // to avoid overruns, see https://github.com/simdutf/simdutf/issues/92
+  const size_t safety_margin = 12;
+
+  // Compare distances rather than forming buf + 16 + safety_margin: that
+  // pointer could lie beyond the end of the input, which is undefined.
+  while (size_t(end - buf) > 16 + safety_margin) {
+    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));
+
+    // Check if no bits set above 16th
+    if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
+      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
+      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
+      if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
+        // 1. pack the bytes
+        // obviously suboptimal.
+        uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
+        // 2. store (8 bytes)
+        vst1_u8(utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        continue; // we are done for this round!
+      }
+
+      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
+        // 1. prepare 2-byte values
+        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+        // expected output   : [110a|aaaa|10bb|bbbb] x 8
+        const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+        const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+        // t0 = [000a|aaaa|bbbb|bb00]
+        const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
+        // t1 = [000a|aaaa|0000|0000]
+        const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+        // t2 = [0000|0000|00bb|bbbb]
+        const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
+        // t3 = [000a|aaaa|00bb|bbbb]
+        const uint16x8_t t3 = vorrq_u16(t1, t2);
+        // t4 = [110a|aaaa|10bb|bbbb]
+        const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+        // 2. merge ASCII and 2-byte codewords
+        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+        const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(
+            vbslq_u16(one_byte_bytemask, utf16_packed, t4));
+        // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+        const uint16x8_t mask = simdutf_make_uint16x8_t(
+            0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
+#else
+        const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
+                                 0x0002, 0x0008, 0x0020, 0x0080};
+#endif
+        uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+        // 4. pack the bytes (row[0] is the output length, row + 1 the shuffle)
+        const uint8_t *row =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+        const uint8x16_t shuffle = vld1q_u8(row + 1);
+        const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+        // 5. store bytes
+        vst1q_u8(utf8_output, utf8_packed);
+
+        // 6. adjust pointers
+        buf += 8;
+        utf8_output += row[0];
+        continue;
+      } else {
+        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
+        forbidden_bytemask =
+            vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff),
+                                vcgeq_u16(utf16_packed, v_d800)),
+                      forbidden_bytemask);
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+        const uint16x8_t dup_even = simdutf_make_uint16x8_t(
+            0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
+                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+#endif
+        /* In this branch we handle three cases:
+          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+          single UTF-8 byte
+          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
+          two UTF-8 bytes
+          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+          three UTF-8 bytes
+
+          We expand the input word (16-bit) into two code units (32-bit), thus
+          we have room for four bytes. However, we need five distinct bit
+          layouts. Note that the last byte in cases #2 and #3 is the same.
+
+          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+          in register t2.
+
+          We precompute byte 1 for case #3 and -- **conditionally** --
+          precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+          they differ by exactly one bit.
+
+          Finally from these two code units we build proper UTF-8 sequence,
+          taking into account the case (i.e, the number of bytes to write).
+        */
+        /**
+         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+         * t2 => [0ccc|cccc] [10cc|cccc]
+         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+         */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+        const uint16x8_t t0 =
+            vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed),
+                                            vreinterpretq_u8_u16(dup_even)));
+        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+        const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+        const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+        const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
+        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+        const uint16x8_t s1 =
+            vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
+        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+        // [00bb|bbbb|0000|aaaa]
+        const uint16x8_t s2 = vorrq_u16(s0, s1s);
+        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+        const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+        const uint16x8_t one_or_two_bytes_bytemask =
+            vcleq_u16(utf16_packed, v_07ff);
+        const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000),
+                                        one_or_two_bytes_bytemask);
+        const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+        // 4. expand code units 16-bit => 32-bit
+        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+        const uint16x8_t onemask = simdutf_make_uint16x8_t(
+            0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
+        const uint16x8_t twomask = simdutf_make_uint16x8_t(
+            0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
+#else
+        const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
+                                    0x0100, 0x0400, 0x1000, 0x4000};
+        const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
+                                    0x0200, 0x0800, 0x2000, 0x8000};
+#endif
+        const uint16x8_t combined =
+            vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
+                      vandq_u16(one_or_two_bytes_bytemask, twomask));
+        const uint16_t mask = vaddvq_u16(combined);
+        // The following fast path may or may not be beneficial.
+        /*if(mask == 0) {
+          // We only have three-byte code units. Use fast path.
+          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+          vst1q_u8(utf8_output, utf8_0);
+          utf8_output += 12;
+          vst1q_u8(utf8_output, utf8_1);
+          utf8_output += 12;
+          buf += 8;
+          continue;
+        }*/
+        const uint8_t mask0 = uint8_t(mask);
+        const uint8_t *row0 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+        const uint8_t *row1 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+        vst1q_u8(utf8_output, utf8_0);
+        utf8_output += row0[0];
+        vst1q_u8(utf8_output, utf8_1);
+        utf8_output += row1[0];
+
+        buf += 8;
+      }
+      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
+      // will produce four UTF-8 bytes.
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      const size_t scalar_count = 15; // the loop guard leaves more than 28
+      for (size_t k = 0; k < scalar_count; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFFFF80) == 0) {
+          *utf8_output++ = char(word);
+        } else if ((word & 0xFFFFF800) == 0) {
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xFFFF0000) == 0) {
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char *>(utf8_output));
+          }
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          if (word > 0x10FFFF) {
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char *>(utf8_output));
+          }
+          *utf8_output++ = char((word >> 18) | 0b11110000);
+          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
+      }
+      buf += scalar_count;
+    }
+  } // while
+
+  // check for invalid input (surrogates accumulated on the vector path)
+  if (vmaxvq_u16(forbidden_bytemask) != 0) {
+    return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
+  }
+  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
+}
+
+std::pair<result, char *>
+arm_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
+                                      char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char32_t *start = buf;
+  const char32_t *end = buf + len;
+
+  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+
+  while (buf + 16 + safety_margin < end) {
+    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));
+
+    // Check if no bits set above 16th
+    if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
+      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
+      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
+      if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
+        // 1. pack the bytes
+        // obviously suboptimal.
+        uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
+        // 2. store (8 bytes)
+        vst1_u8(utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        continue; // we are done for this round!
+      }
+
+      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
+        // 1. prepare 2-byte values
+        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+        // expected output   : [110a|aaaa|10bb|bbbb] x 8
+        const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
+        const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+
+        // t0 = [000a|aaaa|bbbb|bb00]
+        const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
+        // t1 = [000a|aaaa|0000|0000]
+        const uint16x8_t t1 = vandq_u16(t0, v_1f00);
+        // t2 = [0000|0000|00bb|bbbb]
+        const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
+        // t3 = [000a|aaaa|00bb|bbbb]
+        const uint16x8_t t3 = vorrq_u16(t1, t2);
+        // t4 = [110a|aaaa|10bb|bbbb]
+        const uint16x8_t t4 = vorrq_u16(t3, v_c080);
+        // 2. merge ASCII and 2-byte codewords
+        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+        const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(
+            vbslq_u16(one_byte_bytemask, utf16_packed, t4));
+        // 3. prepare bitmask for 8-bit lookup
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+        const uint16x8_t mask = simdutf_make_uint16x8_t(
+            0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
+#else
+        const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
+                                 0x0002, 0x0008, 0x0020, 0x0080};
+#endif
+        uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
+        // 4. pack the bytes
+        const uint8_t *row =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+        const uint8x16_t shuffle = vld1q_u8(row + 1);
+        const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+
+        // 5. store bytes
+        vst1q_u8(utf8_output, utf8_packed);
+
+        // 6. adjust pointers
+        buf += 8;
+        utf8_output += row[0];
+        continue;
+      } else {
+        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+
+        // check for invalid input
+        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
+        const uint16x8_t forbidden_bytemask = vandq_u16(
+            vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
+        if (vmaxvq_u16(forbidden_bytemask) != 0) {
+          return std::make_pair(result(error_code::SURROGATE, buf - start),
+                                reinterpret_cast<char *>(utf8_output));
+        }
+
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+        const uint16x8_t dup_even = simdutf_make_uint16x8_t(
+            0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+#else
+        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
+                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
+#endif
+        /* In this branch we handle three cases:
+          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+          single UTF-8 byte
+          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
+          two UTF-8 bytes
+          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+          three UTF-8 bytes
+
+          We expand the input word (16-bit) into two code units (32-bit), thus
+          we have room for four bytes. However, we need five distinct bit
+          layouts. Note that the last byte in cases #2 and #3 is the same.
+
+          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+          in register t2.
+
+          We precompute byte 1 for case #3 and -- **conditionally** --
+          precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+          they differ by exactly one bit.
+
+          Finally from these two code units we build proper UTF-8 sequence,
+          taking into account the case (i.e, the number of bytes to write).
+        */
+        /**
+         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+         * t2 => [0ccc|cccc] [10cc|cccc]
+         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+         */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+        const uint16x8_t t0 =
+            vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed),
+                                            vreinterpretq_u8_u16(dup_even)));
+        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+        const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+        const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+        const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
+        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+        const uint16x8_t s1 =
+            vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
+        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+        // [00bb|bbbb|0000|aaaa]
+        const uint16x8_t s2 = vorrq_u16(s0, s1s);
+        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+        const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+        const uint16x8_t one_or_two_bytes_bytemask =
+            vcleq_u16(utf16_packed, v_07ff);
+        const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000),
+                                        one_or_two_bytes_bytemask);
+        const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+        // 4. expand code units 16-bit => 32-bit
+        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+        const uint16x8_t onemask = simdutf_make_uint16x8_t(
+            0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
+        const uint16x8_t twomask = simdutf_make_uint16x8_t(
+            0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
+#else
+        const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
+                                    0x0100, 0x0400, 0x1000, 0x4000};
+        const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
+                                    0x0200, 0x0800, 0x2000, 0x8000};
+#endif
+        const uint16x8_t combined =
+            vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
+                      vandq_u16(one_or_two_bytes_bytemask, twomask));
+        const uint16_t mask = vaddvq_u16(combined);
+        // The following fast path may or may not be beneficial.
+        /*if(mask == 0) {
+          // We only have three-byte code units. Use fast path.
+          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
+          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
+          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
+          vst1q_u8(utf8_output, utf8_0);
+          utf8_output += 12;
+          vst1q_u8(utf8_output, utf8_1);
+          utf8_output += 12;
+          buf += 8;
+          continue;
+        }*/
+        const uint8_t mask0 = uint8_t(mask);
+
+        const uint8_t *row0 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
+        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+
+        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+        const uint8_t *row1 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
+        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+
+        vst1q_u8(utf8_output, utf8_0);
+        utf8_output += row0[0];
+        vst1q_u8(utf8_output, utf8_1);
+        utf8_output += row1[0];
+
+        buf += 8;
+      }
+      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
+      // will produce four UTF-8 bytes.
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFFFF80) == 0) {
+          *utf8_output++ = char(word);
+        } else if ((word & 0xFFFFF800) == 0) {
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xFFFF0000) == 0) {
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k),
+                reinterpret_cast<char *>(utf8_output));
+          }
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          if (word > 0x10FFFF) {
+            return std::make_pair(
+                result(error_code::TOO_LARGE, buf - start + k),
+                reinterpret_cast<char *>(utf8_output));
+          }
+          *utf8_output++ = char((word >> 18) | 0b11110000);
+          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        reinterpret_cast<char *>(utf8_output));
+}
+/* end file src/arm64/arm_convert_utf32_to_utf8.cpp */
+
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+/* begin file src/generic/buf_block_reader.h */
+namespace simdutf {
+namespace arm64 {
+namespace {
+
+// Walks through a buffer in block-sized increments, loading the last part with
+// spaces
+template <size_t STEP_SIZE> struct buf_block_reader {
+public:
+  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
+  simdutf_really_inline size_t block_index();
+  simdutf_really_inline bool has_full_block() const;
+  simdutf_really_inline const uint8_t *full_block() const;
+  /**
+   * Get the last block, padded with spaces.
+   *
+   * If no bytes remain (len == idx, e.g. len == 0), dst is left untouched and
+   * 0 is returned; otherwise dst is space-filled and the remaining bytes are
+   * copied to its start. In particular, if len == STEP_SIZE there will be 0
+   * full_blocks and 1 remainder block with STEP_SIZE bytes and no padding.
+   *
+   * @return the number of effective characters in the last block.
+   */
+  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
+  simdutf_really_inline void advance();
+
+private:
+  const uint8_t *buf; // input bytes
+  const size_t len; // total number of input bytes
+  const size_t lenminusstep; // len - STEP_SIZE, or 0 when len < STEP_SIZE
+  size_t idx; // offset of the current block within buf
+};
+
+// Debug helper: copies one simd8x64-sized block, showing bytes below ' ' as '_'
+simdutf_unused static char *format_input_text_64(const uint8_t *text) {
+  constexpr size_t width = sizeof(simd8x64<uint8_t>);
+  static char *buf = reinterpret_cast<char *>(malloc(width + 1));
+  for (size_t pos = 0; pos != width; ++pos) {
+    buf[pos] = int8_t(text[pos]) >= ' ' ? int8_t(text[pos]) : '_';
+  }
+  buf[width] = '\0';
+  return buf;
+}
+
+// Debug helper: like format_input_text_64, but reads from a simd8x64 register
+simdutf_unused static char *format_input_text(const simd8x64<uint8_t> &in) {
+  static char *buf =
+      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+  in.store(reinterpret_cast<uint8_t *>(buf));
+  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+    if (int8_t(buf[i]) < ' ') { // signed compare: plain char may be unsigned
+      buf[i] = '_';
+    }
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
+}
+
+simdutf_unused static char *format_mask(uint64_t mask) {
+  static char *buf = reinterpret_cast<char *>(malloc(64 + 1));
+  for (size_t i = 0; i < 64; i++) { // 'X' for each set bit, lowest bit first
+    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' '; // 64-bit shift operand
+  }
+  buf[64] = '\0';
+  return buf;
+}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline
+buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
+    : buf(_buf), len(_len), // lenminusstep is clamped to 0 for short inputs
+      lenminusstep(_len >= STEP_SIZE ? _len - STEP_SIZE : 0), idx(0) {}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
+  return this->idx; // byte offset of the current block
+}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
+  return lenminusstep > idx; // strict: the last STEP_SIZE bytes are the remainder
+}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline const uint8_t *
+buf_block_reader<STEP_SIZE>::full_block() const {
+  return buf + idx; // start of the current block
+}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t
+buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
+  const size_t remaining = len - idx;
+  if (remaining == 0) {
+    return 0; // dst untouched; memcpy(dst, null, 0) trips some sanitizers
+  }
+  // Space-fill all STEP_SIZE bytes at once, then overlay the real tail.
+  std::memset(dst, 0x20, STEP_SIZE);
+  std::memcpy(dst, buf + idx, remaining);
+  return remaining;
+}
+
+template <size_t STEP_SIZE>
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
+  this->idx += STEP_SIZE; // step to the next block
+}
+
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+/* end file src/generic/buf_block_reader.h */
+/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
+namespace simdutf {
+namespace arm64 {
+namespace {
+namespace utf8_validation {
+
+using namespace simd;
+
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // Result = AND of three nibble-indexed table lookups (prev1 high, prev1 low,
+  // input high); a flag bit survives only if all three tables set it.
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation), Bit 2 = Overlong 3-byte
+  // Bit 3 = Too Large, Bit 4 = Surrogate, Bit 5 = Overlong 2-byte
+  // Bit 6 = Too Large (1000____) and Overlong 4-byte, Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
+
+              // ____0100 ________
+              CARRY | TOO_LARGE,
+              // ____0101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____011_ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+              // ____1___ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____1101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high);
+}
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) {
+  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
+  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
+  const simd8<uint8_t> needs_cont =
+      simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
+  simd8<uint8_t> needs_cont_msb = needs_cont & uint8_t(0x80);
+  return needs_cont_msb ^ sc; // flips bit 7 of sc where needs_cont has it set
+}
+
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the
+// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
+//
+simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
+  // Only the last three positions have a maximum below 255, so (assuming
+  // gt_bits flags bytes above max_value) only lead bytes cut off by the end of
+  // the block are reported: ... 1111____ 111_____ 11______
+  static const uint8_t max_array[32] = {255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        0b11110000u - 1,
+                                        0b11100000u - 1,
+                                        0b11000000u - 1};
+  const simd8<uint8_t> max_value(
+      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]); // table's tail
+  return input.gt_bits(max_value);
+}
+
+struct utf8_checker {
+  // If this is nonzero, there has been a UTF-8 error.
+  simd8<uint8_t> error;
+  // The last input we received
+  simd8<uint8_t> prev_input_block;
+  // Whether the last input we received was incomplete (used for ASCII fast
+  // path)
+  simd8<uint8_t> prev_incomplete;
+
+  //
+  // Check whether the current bytes are valid UTF-8.
+  //
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // prev1 is input.prev<1>(prev_input) (presumably each byte's predecessor,
+    // with prev_input supplying the first one). Byte pairs are classified
+    // first; check_multibyte_lengths then folds in the 3/4-byte length check.
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    simd8<uint8_t> sc = check_special_cases(input, prev1);
+    this->error |= check_multibyte_lengths(input, prev_input, sc);
+  }
+
+  // The only problem that can happen at EOF is that a multibyte character is
+  // too short or a byte value too large in the last bytes: check_special_cases
+  // only checks for bytes too large in the first of two bytes.
+  simdutf_really_inline void check_eof() {
+    // If the previous block had incomplete UTF-8 characters at the end, an
+    // ASCII block can't possibly finish them.
+    this->error |= this->prev_incomplete;
+  }
+
+  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
+    if (simdutf_likely(is_ascii(input))) {
+      this->error |= this->prev_incomplete;
+    } else {
+      // you might think that a for-loop would work, but under Visual Studio, it
+      // is not good enough.
+      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+      }
+      this->prev_incomplete =
+          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
+      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
+    }
+  }
+
+  // do not forget to call check_eof!
+  simdutf_really_inline bool errors() const {
+    return this->error.any_bits_set_anywhere();
+  }
+
+}; // struct utf8_checker
+} // namespace utf8_validation
+
+using utf8_validation::utf8_checker;
+
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
+/* begin file src/generic/utf8_validation/utf8_validator.h */
+namespace simdutf {
+namespace arm64 {
+namespace {
+namespace utf8_validation {
+
+/**
+ * Returns true when input[0, length) passes the checker, 64 bytes at a time.
+ */
+template <class checker>
+bool generic_validate_utf8(const uint8_t *input, size_t length) {
+  checker validator{};
+  buf_block_reader<64> blocks(input, length);
+  for (; blocks.has_full_block(); blocks.advance()) {
+    simd::simd8x64<uint8_t> chunk(blocks.full_block());
+    validator.check_next_input(chunk);
+  }
+  // Tail: get_remainder space-pads it into the zero-initialised scratch block.
+  uint8_t tail[64]{};
+  blocks.get_remainder(tail);
+  simd::simd8x64<uint8_t> last(tail);
+  validator.check_next_input(last);
+  blocks.advance();
+  validator.check_eof();
+  return !validator.errors();
+}
+
+bool generic_validate_utf8(const char *input, size_t length) {
+  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(input);
+  return generic_validate_utf8<utf8_checker>(bytes, length); // default checker
+}
+
+/**
+ * Validates that the string is actual UTF-8; on failure, reports the error
+ * kind and position found by the scalar validator.
+ */
+template <class checker>
+result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
+  const char *const text = reinterpret_cast<const char *>(input);
+  // Error path shared by the block loop and the tail: rescan with the scalar
+  // validator. The SIMD checker sometimes only detects an error in the next
+  // chunk, so the rescan starts one byte before `offset` (when possible).
+  auto locate_error = [text, length](size_t offset) -> result {
+    if (offset != 0) {
+      offset--;
+    }
+    result res = scalar::utf8::rewind_and_validate_with_errors(
+        text, text + offset, length - offset);
+    res.count += offset;
+    return res;
+  };
+
+  checker validator{};
+  buf_block_reader<64> blocks(input, length);
+  size_t consumed{0};
+  for (; blocks.has_full_block(); blocks.advance(), consumed += 64) {
+    simd::simd8x64<uint8_t> chunk(blocks.full_block());
+    validator.check_next_input(chunk);
+    if (validator.errors()) {
+      return locate_error(consumed);
+    }
+  }
+
+  // Tail: get_remainder space-pads it into the zero-initialised scratch block.
+  uint8_t tail[64]{};
+  blocks.get_remainder(tail);
+  simd::simd8x64<uint8_t> last(tail);
+  validator.check_next_input(last);
+  blocks.advance();
+  validator.check_eof();
+  if (validator.errors()) {
+    return locate_error(consumed);
+  }
+  return result(error_code::SUCCESS, length);
+}
+
+result generic_validate_utf8_with_errors(const char *input, size_t length) {
+  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(input);
+  return generic_validate_utf8_with_errors<utf8_checker>(bytes, length);
+}
+
+template <class checker>
+bool generic_validate_ascii(const uint8_t *input, size_t length) {
+  // OR every block together, then test the union once with is_ascii().
+  uint8_t zeros[64]{};
+  simd::simd8x64<uint8_t> accumulated(zeros);
+  buf_block_reader<64> blocks(input, length);
+  for (; blocks.has_full_block(); blocks.advance()) {
+    simd::simd8x64<uint8_t> chunk(blocks.full_block());
+    accumulated |= chunk;
+  }
+  uint8_t tail[64]{};
+  blocks.get_remainder(tail);
+  simd::simd8x64<uint8_t> last(tail);
+  accumulated |= last;
+  return accumulated.is_ascii();
+}
+
+bool generic_validate_ascii(const char *input, size_t length) {
+  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(input);
+  return generic_validate_ascii<utf8_checker>(bytes, length);
+}
+
+template <class checker>
+result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) {
+  // Once a block fails is_ascii(), the scalar validator rescans from that
+  // block's start to produce the error code and exact position.
+  auto locate_error = [input, length](size_t offset) -> result {
+    result res = scalar::ascii::validate_with_errors(
+        reinterpret_cast<const char *>(input + offset), length - offset);
+    return result(res.error, offset + res.count);
+  };
+
+  buf_block_reader<64> blocks(input, length);
+  size_t consumed{0};
+  for (; blocks.has_full_block(); blocks.advance(), consumed += 64) {
+    simd::simd8x64<uint8_t> chunk(blocks.full_block());
+    if (!chunk.is_ascii()) {
+      return locate_error(consumed);
+    }
+  }
+  uint8_t tail[64]{};
+  blocks.get_remainder(tail);
+  simd::simd8x64<uint8_t> last(tail);
+  if (!last.is_ascii()) {
+    return locate_error(consumed);
+  }
+  return result(error_code::SUCCESS, length);
+}
+
+result generic_validate_ascii_with_errors(const char *input, size_t length) {
+  const uint8_t *bytes = reinterpret_cast<const uint8_t *>(input);
+  return generic_validate_ascii_with_errors<utf8_checker>(bytes, length);
+}
+
+} // namespace utf8_validation
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+/* end file src/generic/utf8_validation/utf8_validator.h */
+// transcoding from UTF-8 to UTF-16
+/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
+
+namespace simdutf {
+namespace arm64 {
+namespace {
+namespace utf8_to_utf16 {
+using namespace simd;
+
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // Result = AND of three nibble-indexed table lookups (prev1 high, prev1 low,
+  // input high); a flag bit survives only if all three tables set it.
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation), Bit 2 = Overlong 3-byte
+  // Bit 3 = Too Large, Bit 4 = Surrogate, Bit 5 = Overlong 2-byte
+  // Bit 6 = Too Large (1000____) and Overlong 4-byte, Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
+
+              // ____0100 ________
+              CARRY | TOO_LARGE,
+              // ____0101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____011_ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+              // ____1___ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____1101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high);
+}
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) {
+  const simd8<uint8_t> two_back = input.prev<2>(prev_input);
+  const simd8<uint8_t> three_back = input.prev<3>(prev_input);
+  const simd8<uint8_t> needs_cont =
+      simd8<uint8_t>(must_be_2_3_continuation(two_back, three_back));
+  simd8<uint8_t> needs_cont_msb = needs_cont & uint8_t(0x80);
+  return needs_cont_msb ^ sc; // flips bit 7 of sc where needs_cont has it set
+}
+
+struct validating_transcoder {
+  // If this is nonzero, there has been a UTF-8 error.
+  simd8<uint8_t> error;
+
+  validating_transcoder() : error(uint8_t(0)) {} // error is built from 0
+  //
+  // Check whether the current bytes are valid UTF-8.
+  //
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // Pair every byte with input.prev<1> (presumably its predecessor, with
+    // prev_input feeding the first position), classify the pairs, then let
+    // check_multibyte_lengths fold in the 3- and 4-byte length check.
+    const simd8<uint8_t> preceding = input.prev<1>(prev_input);
+    const simd8<uint8_t> pair_flags = check_special_cases(input, preceding);
+    error |= check_multibyte_lengths(input, prev_input, pair_flags);
+  }
+
+  template <endianness endian>
+  simdutf_really_inline size_t convert(const char *in, size_t size,
+                                       char16_t *utf16_output) {
+    size_t pos = 0;
+    char16_t *start{utf16_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, then we have that margin-1 is the eight last
+    // leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf16<endian>(utf16_output);
+        utf16_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (utf8_continuation_mask & 1) {
+          return 0; // error
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 inputs bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf16<endian>(
+              in + pos, utf8_end_of_code_point_mask, utf16_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      return 0;
+    }
+    if (pos < size) {
+      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
+          in + pos, size - pos, utf16_output);
+      if (howmany == 0) {
+        return 0;
+      }
+      utf16_output += howmany;
+    }
+    return utf16_output - start;
+  }
+
+  template <endianness endian>
+  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+                                                   char16_t *utf16_output) {
+    size_t pos = 0;
+    char16_t *start{utf16_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, then we have that margin-1 is the eight last
+    // leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf16<endian>(utf16_output);
+        utf16_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (errors() || (utf8_continuation_mask & 1)) {
+          // rewind_and_convert_with_errors will seek a potential error from
+          // in+pos onward, with the ability to go back up to pos bytes, and
+          // read size-pos bytes forward.
+          result res =
+              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+                  pos, in + pos, size - pos, utf16_output);
+          res.count += pos;
+          return res;
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 inputs bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf16<endian>(
+              in + pos, utf8_end_of_code_point_mask, utf16_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res =
+          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+              pos, in + pos, size - pos, utf16_output);
+      res.count += pos;
+      return res;
+    }
+    if (pos < size) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res =
+          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+              pos, in + pos, size - pos, utf16_output);
+      if (res.error) { // In case of error, we want the error position
+        res.count += pos;
+        return res;
+      } else { // In case of success, we want the number of word written
+        utf16_output += res.count;
+      }
+    }
+    return result(error_code::SUCCESS, utf16_output - start);
+  }
+
+  simdutf_really_inline bool errors() const {
+    return this->error.any_bits_set_anywhere();
+  }
+
+}; // struct utf8_checker
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
+/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
+
+namespace simdutf {
+namespace arm64 {
+namespace {
+namespace utf8_to_utf16 {
+
+using namespace simd;
+
+template <endianness endian>
+simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
+                                         char16_t *utf16_output) noexcept {
+  // Transcodes UTF-8 into UTF-16 of the requested endianness without
+  // validating it, and returns the number of char16_t written. Nothing here
+  // is architecture-specific: whole 64-byte blocks go through SIMD code and
+  // the remainder is handed to the scalar routine.
+  char16_t *const output_begin = utf16_output;
+  size_t offset = 0;
+  // The SIMD loop stops at least 16 bytes short of the end of the input, to
+  // avoid overruns!
+  constexpr size_t tail_reserve = 16;
+  while (offset + 64 + tail_reserve <= size) {
+    // One 64-byte block per iteration. This could be unrolled further, for
+    // example by working on a mask that covers far more than 64 bytes.
+    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(input + offset));
+    if (block.is_ascii()) {
+      // Fast path: the block is pure ASCII, so it yields 64 code units.
+      block.store_ascii_as_utf16<endian>(utf16_output);
+      utf16_output += 64;
+      offset += 64;
+      continue;
+    }
+    // Slow path; we hope that the compiler recognizes it as such.
+    // Signed bytes below -64 are continuation bytes: -65 is 0b10111111 in
+    // two's complement, the largest possible continuation byte. Anything
+    // else is a 'leading byte', that is, the start of a new code point.
+    const uint64_t continuation_bits = block.lt(-65 + 1);
+    // The *start* of code points is not so useful; we want their *end*,
+    // which is the leading-byte mask shifted right by one position.
+    uint64_t code_point_ends = ~continuation_bits >> 1;
+    // The kernel handles blocks of up to 12 bytes, except on fast paths
+    // that may handle up to 16. Its regular path needs at least 12 input
+    // bytes, so no call may start in the last 12 bytes of this block.
+    const size_t last_start = offset + 64 - 12;
+    // The loop below runs at least five times when only the regular path is
+    // taken, and at least four times when fast paths are involved.
+    while (offset < last_start) {
+      // Performance note: being able to compute `used`, then shift the
+      // mask and go again, is critical. With a latency of, say, 4 cycles
+      // on `used`, one iteration may have a total latency of about 6
+      // cycles while covering 6 to 12 input bytes, so this section is
+      // limited to between 1 cycle/byte and 0.5 cycle/byte. There is
+      // little room to add latency here without hurting performance,
+      // hence the kernel's fast-path mode that consumes 16 bytes at once
+      // (e.g., when encountering ASCII).
+      const size_t used = convert_masked_utf8_to_utf16<endian>(
+          input + offset, code_point_ends, utf16_output);
+      offset += used;
+      code_point_ends >>= used;
+    }
+    // Between 0 and 12 bytes of the block may remain at this point. They
+    // are read again by the next iteration, so the efficiency is 80% in the
+    // worst case; in practice 85% to 90% is expected.
+  }
+  // Everything that is left, reserved tail included, is converted by the
+  // scalar implementation, which reports how many code units it wrote.
+  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
+      input + offset, size - offset, utf16_output);
+  // The result is the distance between the final and initial write positions.
+  return utf16_output - output_begin;
+}
+
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
+// transcoding from UTF-8 to UTF-32
+/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
+
+namespace simdutf {
+namespace arm64 {
+namespace {
+namespace utf8_to_utf32 {
+using namespace simd;
+
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // Returns the AND of three 16-entry lookups (high nibble of prev1, low
+  // nibble of prev1, high nibble of input); each bit flags one condition:
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation), Bit 2 = Overlong 3-byte
+  // Bit 3 = Too Large, Bit 4 = Surrogate, Bit 5 = Overlong 2-byte
+  // Bit 6 = Too Large (1000____) or Overlong 4-byte, Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; // same bit as OVERLONG_4
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // set for every low nibble of byte 1
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
+
+              // ____0100 ________
+              CARRY | TOO_LARGE,
+              // ____0101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____011_ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+              // ____1___ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____1101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high);
+}
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) {
+  // Ask must_be_2_3_continuation about the bytes two and three positions
+  // back (presumably flagging lanes that have to hold a continuation byte).
+  const simd8<uint8_t> required = simd8<uint8_t>(must_be_2_3_continuation(
+      input.prev<2>(prev_input), input.prev<3>(prev_input)));
+  // Keep only bit 7 of that answer and XOR it with the special-case bits.
+  return (required & uint8_t(0x80)) ^ sc;
+}
+
+struct validating_transcoder { // validates UTF-8 while transcoding to UTF-32
+  // Error bits accumulated by check_utf8_bytes; nonzero means a UTF-8 error.
+  simd8<uint8_t> error;
+
+  validating_transcoder() : error(uint8_t(0)) {}
+  // Checks one SIMD register of input for UTF-8 errors. `prev_input` is the
+  // register that precedes it (convert() passes zeros for the first register
+  // of a 64-byte block). Findings are OR-ed into `error`, not returned.
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // prev1 is expected to be `input` delayed by one byte, the gap being
+    // filled from `prev_input` (simd8::prev is defined elsewhere). The bits
+    // from check_special_cases are then combined with the length check.
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    simd8<uint8_t> sc = check_special_cases(input, prev1);
+    this->error |= check_multibyte_lengths(input, prev_input, sc);
+  }
+
+  simdutf_really_inline size_t convert(const char *in, size_t size,
+                                       char32_t *utf32_output) {
+    size_t pos = 0;
+    char32_t *start{utf32_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // After the loop, in[margin] is the eighth-last leading byte. With fewer
+    // than 8 leading bytes, margin is 0 and the SIMD loop below never runs.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf32(utf32_output);
+        utf32_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (utf8_continuation_mask & 1) {
+          return 0; // error: the block starts with a continuation byte
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 inputs bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf32(
+              in + pos, utf8_end_of_code_point_mask, utf32_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      return 0; // check_utf8_bytes recorded an error in some block
+    }
+    if (pos < size) {
+      size_t howmany =
+          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+      if (howmany == 0) {
+        return 0;
+      }
+      utf32_output += howmany;
+    }
+    return utf32_output - start; // number of char32_t written
+  }
+
+  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+                                                   char32_t *utf32_output) {
+    size_t pos = 0;
+    char32_t *start{utf32_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // After the loop, in[margin] is the eighth-last leading byte. With fewer
+    // than 8 leading bytes, margin is 0 and the SIMD loop below never runs.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf32(utf32_output);
+        utf32_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (errors() || (utf8_continuation_mask & 1)) {
+          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+              pos, in + pos, size - pos, utf32_output);
+          res.count += pos;
+          return res;
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 inputs bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf32(
+              in + pos, utf8_end_of_code_point_mask, utf32_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, utf32_output);
+      res.count += pos;
+      return res;
+    }
+    if (pos < size) {
+      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, utf32_output);
+      if (res.error) { // In case of error, we want the error position
+        res.count += pos;
+        return res;
+      } else { // In case of success, we want the number of words written
+        utf32_output += res.count;
+      }
+    }
+    return result(error_code::SUCCESS, utf32_output - start);
+  }
+
+  simdutf_really_inline bool errors() const {
+    return this->error.any_bits_set_anywhere();
+  }
+
+}; // struct validating_transcoder
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
+/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
+
+namespace simdutf {
+namespace arm64 {
+namespace {
+namespace utf8_to_utf32 {
+
+using namespace simd;
+
+simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
+                                         char32_t *utf32_output) noexcept {
+  // Transcodes UTF-8 to UTF-32 without validating it; returns the number of
+  // char32_t written. At least 16 bytes are always left to the scalar code.
+  char32_t *const output_begin = utf32_output;
+  size_t offset = 0;
+  constexpr size_t tail_reserve = 16; // to avoid overruns!
+  while (offset + 64 + tail_reserve <= size) {
+    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(input + offset));
+    if (block.is_ascii()) {
+      block.store_ascii_as_utf32(utf32_output);
+      utf32_output += 64;
+      offset += 64;
+      continue;
+    }
+    // Continuation bytes are the signed bytes below -64 (-65 is 0b10111111).
+    const uint64_t continuation_bits = block.lt(-65 + 1);
+    uint64_t code_point_ends = ~continuation_bits >> 1;
+    const size_t last_start = offset + 64 - 12;
+    while (offset < last_start) {
+      const size_t used = convert_masked_utf8_to_utf32(
+          input + offset, code_point_ends, utf32_output);
+      offset += used;
+      code_point_ends >>= used;
+    }
+  }
+  utf32_output += scalar::utf8_to_utf32::convert_valid(
+      input + offset, size - offset, utf32_output);
+  return utf32_output - output_begin;
+}
+
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
+// other functions
+/* begin file src/generic/utf16.h */
+namespace simdutf {
+namespace arm64 {
+namespace {
+namespace utf16 {
+
+template <endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t *in,
+                                               size_t size) {
+  // Counts one code point per code unit that lies outside 0xDC00..0xDFFF.
+  const size_t simd_len = size / 32 * 32; // whole 32-unit blocks only
+  size_t total = 0, offset = 0;
+  while (offset < simd_len) {
+    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + offset));
+    if (!match_system(big_endian)) { block.swap_bytes(); }
+    // Popcount is halved: the mask seems to hold two bits per code unit
+    // (TODO confirm against simd16x32::not_in_range).
+    total += count_ones(block.not_in_range(0xDC00, 0xDFFF)) / 2;
+    offset += 32;
+  }
+  return total + scalar::utf16::count_code_points<big_endian>(in + offset,
+                                                            size - offset);
+}
+
+template <endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
+                                                    size_t size) {
+  // Adds up the number of UTF-8 bytes contributed by every UTF-16 code unit.
+  // This algorithm could no doubt be improved!
+  const size_t simd_len = size / 32 * 32; // whole 32-unit blocks only
+  size_t total = 0, offset = 0;
+  while (offset < simd_len) {
+    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + offset));
+    if (!match_system(big_endian)) { block.swap_bytes(); }
+    const uint64_t upto_7f = block.lteq(0x7F);
+    const uint64_t upto_7ff = block.lteq(0x7FF);
+    const uint64_t non_surrogate = block.not_in_range(0xD800, 0xDFFF);
+    // Each popcount is halved (two mask bits per code unit, presumably).
+    const size_t one_byte = count_ones(upto_7f) / 2;
+    const size_t two_bytes = count_ones(upto_7ff & ~upto_7f) / 2;
+    const size_t three_bytes = count_ones(non_surrogate & ~upto_7ff) / 2;
+    // Units in 0xD800..0xDFFF: each is weighted 2 bytes in the sum below.
+    const size_t surrogates = 32 - count_ones(non_surrogate) / 2;
+    total += 2 * surrogates + 3 * three_bytes + 2 * two_bytes + one_byte;
+    offset += 32;
+  }
+  return total + scalar::utf16::utf8_length_from_utf16<big_endian>(
+                     in + offset, size - offset);
+}
+
+template <endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
+                                                     size_t size) {
+  return count_code_points<big_endian>(in, size); // length == code point count
+}
+
+simdutf_really_inline void
+change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
+  // Copies `size` code units from `in` to `output` with their bytes swapped:
+  // 32 units at a time here, the remainder through the scalar routine.
+  const size_t simd_len = size / 32 * 32; // whole 32-unit blocks only
+  size_t offset = 0;
+  for (; offset < simd_len; offset += 32, output += 32) {
+    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + offset));
+    block.swap_bytes();
+    block.store(reinterpret_cast<uint16_t *>(output));
+  }
+  // Fewer than 32 code units remain at this point.
+  scalar::utf16::change_endianness_utf16(in + offset, size - offset, output);
+}
+
+} // namespace utf16
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+/* end file src/generic/utf16.h */
+/* begin file src/generic/utf8.h */
+
+namespace simdutf {
+namespace arm64 {
+namespace {
+namespace utf8 {
+
+using namespace simd;
+
+// Code points = bytes that are not continuations (int8_t value > -65).
+simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
+  size_t total = 0;
+  size_t offset = 0;
+  for (; offset + 64 <= size; offset += 64) {
+    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(in + offset));
+    total += count_ones(uint64_t(block.gt(-65)));
+  }
+  return total + scalar::utf8::count_code_points(in + offset, size - offset);
+}
+
+// Number of UTF-16 code units needed for the UTF-8 input `in[0, size)`.
+simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
+                                                    size_t size) {
+  size_t pos = 0;
+  size_t count = 0;
+  for (; pos + 64 <= size; pos += 64) {
+    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+    // One code unit for every byte that is not a continuation byte.
+    count += 64 - count_ones(utf8_continuation_mask);
+    // Bytes >= 240 (0xF0) lead 4-byte sequences: they take a second unit.
+    // The mask is kept unsigned, like every other bitmask in this file.
+    uint64_t utf8_4byte = input.gteq_unsigned(240);
+    count += count_ones(utf8_4byte);
+  }
+  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
+}
+} // namespace utf8
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+/* end file src/generic/utf8.h */
+// transcoding from UTF-8 to Latin 1
+/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
+
+namespace simdutf {
+namespace arm64 {
+namespace {
+namespace utf8_to_latin1 {
+using namespace simd;
+
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // Classifies each (prev1, input) byte pair; a nonzero result byte marks an
+  // error. For UTF-8 to Latin 1 we allow any ASCII character and any
+  // continuation byte, but a non-ASCII leading byte must be 0b11000010 or
+  // 0b11000011. Error bits (see the constants below):
+  // Bit 0 = Too Short (lead byte followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation)
+  // Bit 2 = Overlong 3-byte         Bit 3 = Too Large
+  // Bit 4 = Surrogate               Bit 5 = Overlong 2-byte
+  // Bit 6 = Too Large 1000 / Overlong 4-byte (shared)
+  // Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+  constexpr const uint8_t FORBIDDEN = 0xff;    // all bits: matches any error
+
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      FORBIDDEN,
+      // 1110____ ________ <three byte lead in byte 1>
+      FORBIDDEN,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      FORBIDDEN);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
+
+              // ____0100 ________
+              FORBIDDEN,
+              // ____0101 ________
+              FORBIDDEN,
+              // ____011_ ________
+              FORBIDDEN, FORBIDDEN,
+
+              // ____1___ ________ (1000 through 1100)
+              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
+              // ____1101 ________ (and 1110, 1111)
+              FORBIDDEN, FORBIDDEN, FORBIDDEN);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high); // bits all three agree on
+}
+
+struct validating_transcoder {
+  // Accumulated check_special_cases() flags; nonzero once an error was seen.
+  simd8<uint8_t> error;
+
+  validating_transcoder() : error(uint8_t(0)) {}
+  //
+  // Folds the check of `input` (with `prev_input` as the preceding chunk)
+  // into this->error.
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // prev1 is built from `input` and `prev_input` by prev<1>() - presumably
+    // `input` shifted back by one byte (TODO confirm); only this one-byte
+    // lookbehind is checked here.
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    this->error |= check_special_cases(input, prev1);
+  }
+  // UTF-8 -> Latin 1. Returns the number of bytes written, or 0 on failure.
+  simdutf_really_inline size_t convert(const char *in, size_t size,
+                                       char *latin1_output) {
+    size_t pos = 0;
+    char *start{latin1_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 16 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 16; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) >
+                       -65); // twos complement of -65 is 1011 1111 ...
+    }
+    // If the input is long enough, then in[margin] is the 16th leading byte
+    // counted from the end.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store((int8_t *)latin1_output);
+        latin1_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask =
+            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
+                               // this case, we also have ASCII to account for.
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 inputs bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_latin1(
+              in + pos, utf8_end_of_code_point_mask, latin1_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      return 0;
+    }
+    if (pos < size) {
+      size_t howmany =
+          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
+      if (howmany == 0) {
+        return 0;
+      }
+      latin1_output += howmany;
+    }
+    return latin1_output - start;
+  }
+  // UTF-8 -> Latin 1 with error reporting: error code plus position or count.
+  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+                                                   char *latin1_output) {
+    size_t pos = 0;
+    char *start{latin1_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, then in[margin] is the 8th leading byte
+    // counted from the end.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store((int8_t *)latin1_output);
+        latin1_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        if (errors()) {
+          // rewind_and_convert_with_errors will seek a potential error from
+          // in+pos onward, with the ability to go back up to pos bytes, and
+          // read size-pos bytes forward.
+          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+              pos, in + pos, size - pos, latin1_output);
+          res.count += pos;
+          return res;
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 inputs bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_latin1(
+              in + pos, utf8_end_of_code_point_mask, latin1_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, latin1_output);
+      res.count += pos;
+      return res;
+    }
+    if (pos < size) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, latin1_output);
+      if (res.error) { // In case of error, we want the error position
+        res.count += pos;
+        return res;
+      } else { // In case of success, we want the number of bytes written
+        latin1_output += res.count;
+      }
+    }
+    return result(error_code::SUCCESS, latin1_output - start);
+  }
+  // True once any error flag has been recorded in this->error.
+  simdutf_really_inline bool errors() const {
+    return this->error.any_bits_set_anywhere();
+  }
+
+}; // struct validating_transcoder
+} // namespace utf8_to_latin1
+} // unnamed namespace
+} // namespace arm64
+} // namespace simdutf
+/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
+/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
+
+namespace simdutf {
+namespace arm64 {
+namespace {
+namespace utf8_to_latin1 {
+using namespace simd;
+
+simdutf_really_inline size_t convert_valid(const char *in, size_t size,
+                                           char *latin1_output) {
+  // Converts UTF-8 to Latin 1 without validating the input and returns the
+  // number of bytes written to latin1_output.
+  char *const out_begin = latin1_output;
+  size_t offset = 0;
+
+  // Upstream note: in the worst case (the haswell kernel),
+  // convert_masked_utf8_to_latin1 can overflow by 8 bytes, so the vectorized
+  // loop must stop before the end of the input. To get a good margin, walk
+  // back from the end until 8 leading (non-continuation) bytes were counted.
+  size_t leads_seen = 0;
+  size_t tail_start = size;
+  while (tail_start > 0 && leads_seen < 8) {
+    // As int8_t, continuation bytes (10xxxxxx) are <= -65.
+    leads_seen += (int8_t(in[tail_start - 1]) > -65);
+    tail_start--;
+  }
+  // If the input is long enough, in[tail_start] is now the 8th leading byte
+  // counted from the end.
+  const size_t safety_margin = size - tail_start + 1; // to avoid overruns!
+
+  while (offset + 64 + safety_margin <= size) {
+    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(in + offset));
+    if (block.is_ascii()) {
+      // An all-ASCII block is copied through unchanged.
+      block.store((int8_t *)latin1_output);
+      latin1_output += 64;
+      offset += 64;
+      continue;
+    }
+    // Bytes below -64 (as int8_t) are continuation bytes; every other byte,
+    // ASCII included, counts as a leading byte here.
+    const uint64_t continuation_mask = block.lt(-65 + 1);
+    const uint64_t leading_mask = ~continuation_mask;
+    // Shifting by one marks the byte just before each leading byte, i.e. the
+    // last byte of the preceding code point.
+    uint64_t end_of_code_point_mask = leading_mask >> 1;
+    // We process in blocks of up to 12 bytes except possibly
+    // for fast paths which may process up to 16 bytes. For the
+    // slow path to work, we should have at least 12 input bytes left.
+    const size_t last_start = (offset + 64) - 12;
+    while (offset < last_start) {
+      // Performance note: our ability to compute 'consumed' and
+      // then shift and recompute is critical. The latency of getting
+      // 'consumed' is on the critical path of this loop, which handles
+      // only 6 to 12 input bytes per iteration, so there is a limit to
+      // how much that latency can grow before it seriously harms
+      // performance.
+      const size_t consumed = convert_masked_utf8_to_latin1(
+          in + offset, end_of_code_point_mask, latin1_output);
+      offset += consumed;
+      end_of_code_point_mask >>= consumed;
+    }
+    // At this point there may remain between 0 and 12 bytes in the
+    // 64-byte block. These bytes will be processed again.
+  }
+
+  if (offset < size) {
+    // Scalar conversion of whatever the vectorized loop left behind.
+    const size_t tail_written = scalar::utf8_to_latin1::convert_valid(
+        in + offset, size - offset, latin1_output);
+    latin1_output += tail_written;
+  }
+  return latin1_output - out_begin;
+}
+
+} // namespace utf8_to_latin1
+} // namespace
+} // namespace arm64
+} // namespace simdutf
+  // namespace simdutf
+/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
+
+// placeholder scalars
+
+//
+// Implementation-specific overrides
+//
+namespace simdutf {
+namespace arm64 {
+
+// Returns a bitmask of the encoding_type values `input` could be: the BOM's
+// encoding when a BOM is present, otherwise every encoding that validates.
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+                                 size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  const auto from_bom = simdutf::BOM::check_bom(input, length);
+  if (from_bom != encoding_type::unspecified) {
+    return from_bom;
+  }
+  // todo: reimplement as a one-pass algorithm.
+  int candidates = 0;
+  if (validate_utf8(input, length)) {
+    candidates |= encoding_type::UTF8;
+  }
+  // UTF-16 is only tried on an even byte count, UTF-32 on a multiple of four.
+  const auto *as_utf16 = reinterpret_cast<const char16_t *>(input);
+  if ((length % 2) == 0 && validate_utf16le(as_utf16, length / 2)) {
+    candidates |= encoding_type::UTF16_LE;
+  }
+  const auto *as_utf32 = reinterpret_cast<const char32_t *>(input);
+  if ((length % 4) == 0 && validate_utf32(as_utf32, length / 4)) {
+    candidates |= encoding_type::UTF32_LE;
+  }
+  return candidates;
+}
+// UTF-8 validation; forwards to utf8_validation::generic_validate_utf8.
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  return arm64::utf8_validation::generic_validate_utf8(buf, len);
+}
+// UTF-8 validation returning a `result`; forwards to the generic validator.
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+    const char *buf, size_t len) const noexcept {
+  return arm64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+}
+// ASCII validation; forwards to utf8_validation::generic_validate_ascii.
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  return arm64::utf8_validation::generic_validate_ascii(buf, len);
+}
+// ASCII validation returning a `result`; forwards to the generic validator.
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+    const char *buf, size_t len) const noexcept {
+  return arm64::utf8_validation::generic_validate_ascii_with_errors(buf, len);
+}
+
+// Validates UTF-16LE: arm_validate_utf16 checks the bulk, scalar the rest.
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *buf,
+                                 size_t len) const noexcept {
+  // Empty input is valid; this also keeps a null buf away from the kernel.
+  if (simdutf_unlikely(len == 0)) {
+    return true;
+  }
+  const char16_t *rest = arm_validate_utf16<endianness::LITTLE>(buf, len);
+  if (rest == nullptr) {
+    return false; // null means the kernel rejected the input
+  }
+  const size_t rest_len = len - (rest - buf);
+  return scalar::utf16::validate<endianness::LITTLE>(rest, rest_len);
+}
+
+// Validates UTF-16BE: arm_validate_utf16 checks the bulk, scalar the rest.
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *buf,
+                                 size_t len) const noexcept {
+  // Empty input is valid; this also keeps a null buf away from the kernel.
+  if (simdutf_unlikely(len == 0)) {
+    return true;
+  }
+  const char16_t *rest = arm_validate_utf16<endianness::BIG>(buf, len);
+  if (rest == nullptr) {
+    return false; // null means the kernel rejected the input
+  }
+  return scalar::utf16::validate<endianness::BIG>(rest, len - (rest - buf));
+}
+
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+    const char16_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+  result res = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(
+        buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
+}
+
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+    const char16_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+  result res = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(
+        buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
+}
+
+// Validates UTF-32: arm_validate_utf32le checks the bulk, scalar the rest.
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  // Empty input is valid; this also keeps a null buf away from the kernel.
+  if (simdutf_unlikely(len == 0)) {
+    return true;
+  }
+  const char32_t *rest = arm_validate_utf32le(buf, len);
+  if (rest == nullptr) {
+    return false; // null means the kernel rejected the input
+  }
+  return scalar::utf32::validate(rest, len - (rest - buf));
+}
+
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+    const char32_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+  result res = arm_validate_utf32le_with_errors(buf, len);
+  if (res.count != len) {
+    result scalar_res =
+        scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
+}
+
+// Latin 1 -> UTF-8 (kernel, then scalar tail); returns the bytes written.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
+    const char *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char *, char *> simd_end =
+      arm_convert_latin1_to_utf8(buf, len, utf8_output);
+  const size_t simd_written = simd_end.second - utf8_output;
+  if (simd_end.first == buf + len) {
+    return simd_written; // the kernel consumed the whole input
+  }
+  const size_t remaining = len - (simd_end.first - buf);
+  return simd_written + scalar::latin1_to_utf8::convert(
+                            simd_end.first, remaining, simd_end.second);
+}
+
+// Latin 1 -> UTF-16LE (kernel, then scalar tail); returns units written.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char *, char16_t *> simd_end =
+      arm_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  const size_t simd_written = simd_end.second - utf16_output;
+  if (simd_end.first == buf + len) {
+    return simd_written; // the kernel consumed the whole input
+  }
+  const size_t remaining = len - (simd_end.first - buf);
+  return simd_written + scalar::latin1_to_utf16::convert<endianness::LITTLE>(
+                            simd_end.first, remaining, simd_end.second);
+}
+
+// Latin 1 -> UTF-16BE (kernel, then scalar tail); returns units written.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char *, char16_t *> simd_end =
+      arm_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  const size_t simd_written = simd_end.second - utf16_output;
+  if (simd_end.first == buf + len) {
+    return simd_written; // the kernel consumed the whole input
+  }
+  const size_t remaining = len - (simd_end.first - buf);
+  return simd_written + scalar::latin1_to_utf16::convert<endianness::BIG>(
+                            simd_end.first, remaining, simd_end.second);
+}
+
+// Latin 1 -> UTF-32 (kernel, then scalar tail); returns units written.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char *, char32_t *> simd_end =
+      arm_convert_latin1_to_utf32(buf, len, utf32_output);
+  const size_t simd_written = simd_end.second - utf32_output;
+  if (simd_end.first == buf + len) {
+    return simd_written; // the kernel consumed the whole input
+  }
+  return simd_written + scalar::latin1_to_utf32::convert(
+      simd_end.first, len - (simd_end.first - buf), simd_end.second);
+}
+// UTF-8 -> Latin 1 using a fresh utf8_to_latin1::validating_transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  utf8_to_latin1::validating_transcoder converter;
+  return converter.convert(buf, len, latin1_output);
+}
+// UTF-8 -> Latin 1 with error reporting, via a fresh validating_transcoder.
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  utf8_to_latin1::validating_transcoder converter;
+  return converter.convert_with_errors(buf, len, latin1_output);
+}
+// Non-validating UTF-8 -> Latin 1; forwards to utf8_to_latin1::convert_valid.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  return arm64::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+}
+// UTF-8 -> UTF-16LE using a fresh utf8_to_utf16::validating_transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+}
+// UTF-8 -> UTF-16BE using a fresh utf8_to_utf16::validating_transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert<endianness::BIG>(buf, len, utf16_output);
+}
+// UTF-8 -> UTF-16LE with error reporting, via a fresh validating_transcoder.
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert_with_errors<endianness::LITTLE>(buf, len,
+                                                           utf16_output);
+}
+// UTF-8 -> UTF-16BE with error reporting, via a fresh validating_transcoder.
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+}
+// UTF-8 -> UTF-16LE; forwards to utf8_to_utf16::convert_valid.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,
+                                                          utf16_output);
+}
+// UTF-8 -> UTF-16BE; forwards to utf8_to_utf16::convert_valid.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,
+                                                       utf16_output);
+}
+// UTF-8 -> UTF-32 using a fresh utf8_to_utf32::validating_transcoder.
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder converter;
+  return converter.convert(buf, len, utf32_output);
+}
+// UTF-8 -> UTF-32 with error reporting, via a fresh validating_transcoder.
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder converter;
+  return converter.convert_with_errors(buf, len, utf32_output);
+}
+// UTF-8 -> UTF-32; forwards to utf8_to_utf32::convert_valid.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+    const char *input, size_t size, char32_t *utf32_output) const noexcept {
+  return utf8_to_utf32::convert_valid(input, size, utf32_output);
+}
+
+// UTF-16LE -> Latin 1; returns the bytes written, or 0 on failure.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char16_t *, char *> simd_end =
+      arm_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
+  const char16_t *src = simd_end.first;
+  char *dst = simd_end.second;
+  if (src == nullptr) {
+    return 0; // the kernel signals failure with a null input pointer
+  }
+  const size_t simd_written = dst - latin1_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf16_to_latin1::convert<endianness::LITTLE>(
+          src, len - (src - buf), dst);
+  // A zero-length result from the scalar tail is treated as a failure.
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+// UTF-16BE -> Latin 1; returns the bytes written, or 0 on failure.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char16_t *, char *> simd_end =
+      arm_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
+  const char16_t *src = simd_end.first;
+  char *dst = simd_end.second;
+  if (src == nullptr) {
+    return 0; // the kernel signals failure with a null input pointer
+  }
+  const size_t simd_written = dst - latin1_output;
+  if (src == buf + len) {
+    return simd_written;
+  }
+  const size_t tail_written =
+      scalar::utf16_to_latin1::convert<endianness::BIG>(
+          src, len - (src - buf), dst);
+  // A zero-length result from the scalar tail is treated as a failure.
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<result, char *> ret =
+      arm_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
+          buf, len, latin1_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      latin1_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<result, char *> ret =
+      arm_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
+                                                               latin1_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      latin1_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
+
+// "Valid input" flavour of UTF-16BE -> Latin-1. There is no dedicated fast
+// path yet (optimization opportunity), so it reuses the validating
+// convert_utf16be_to_latin1.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written = this->convert_utf16be_to_latin1(buf, len,
+                                                         latin1_output);
+  return written;
+}
+
+// "Valid input" flavour of UTF-16LE -> Latin-1. There is no dedicated fast
+// path yet (optimization opportunity), so it reuses the validating
+// convert_utf16le_to_latin1.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  const size_t written = this->convert_utf16le_to_latin1(buf, len,
+                                                         latin1_output);
+  return written;
+}
+
+// UTF-16LE -> UTF-8. A vectorized pass (arm_convert_utf16_to_utf8) runs first
+// and the scalar routine finishes whatever input it left unread. Returns the
+// chars written by the SIMD pass plus the scalar routine's return value, or 0
+// when the SIMD helper yields a null input pointer or the scalar routine
+// returns 0.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  // simd.first: next unread input; simd.second: next free output slot.
+  const std::pair<const char16_t *, char *> simd =
+      arm_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+  if (simd.first == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = simd.second - utf8_output;
+  if (simd.first == buf + len) {
+    return simd_written; // the SIMD pass consumed everything
+  }
+  const size_t tail_written =
+      scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+          simd.first, len - (simd.first - buf), simd.second);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+// UTF-16BE -> UTF-8. A vectorized pass (arm_convert_utf16_to_utf8) runs first
+// and the scalar routine finishes whatever input it left unread. Returns the
+// chars written by the SIMD pass plus the scalar routine's return value, or 0
+// when the SIMD helper yields a null input pointer or the scalar routine
+// returns 0.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  // simd.first: next unread input; simd.second: next free output slot.
+  const std::pair<const char16_t *, char *> simd =
+      arm_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+  if (simd.first == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = simd.second - utf8_output;
+  if (simd.first == buf + len) {
+    return simd_written; // the SIMD pass consumed everything
+  }
+  const size_t tail_written =
+      scalar::utf16_to_utf8::convert<endianness::BIG>(
+          simd.first, len - (simd.first - buf), simd.second);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char *> ret =
+      arm_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len,
+                                                                utf8_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf8_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char *> ret =
+      arm_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len,
+                                                             utf8_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf8_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
+
+// "Valid input" flavour of UTF-16LE -> UTF-8; reuses the validating
+// convert_utf16le_to_utf8 and returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t written = this->convert_utf16le_to_utf8(buf, len, utf8_output);
+  return written;
+}
+
+// "Valid input" flavour of UTF-16BE -> UTF-8; reuses the validating
+// convert_utf16be_to_utf8 and returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t written = this->convert_utf16be_to_utf8(buf, len, utf8_output);
+  return written;
+}
+
+// UTF-32 -> UTF-8. Empty input returns 0 immediately. Otherwise a vectorized
+// pass (arm_convert_utf32_to_utf8) runs first and the scalar routine finishes
+// whatever input it left unread. Returns the chars written by the SIMD pass
+// plus the scalar routine's return value, or 0 when the SIMD helper yields a
+// null input pointer or the scalar routine returns 0.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return 0;
+  }
+  // simd.first: next unread input; simd.second: next free output slot.
+  const std::pair<const char32_t *, char *> simd =
+      arm_convert_utf32_to_utf8(buf, len, utf8_output);
+  if (simd.first == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = simd.second - utf8_output;
+  if (simd.first == buf + len) {
+    return simd_written; // the SIMD pass consumed everything
+  }
+  const size_t tail_written = scalar::utf32_to_utf8::convert(
+      simd.first, len - (simd.first - buf), simd.second);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char *> ret =
+      arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf8_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
+
+// UTF-16LE -> UTF-32. A vectorized pass (arm_convert_utf16_to_utf32) runs
+// first and the scalar routine finishes whatever input it left unread.
+// Returns the char32_t units written by the SIMD pass plus the scalar
+// routine's return value, or 0 when the SIMD helper yields a null input
+// pointer or the scalar routine returns 0.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // simd.first: next unread input; simd.second: next free output slot.
+  const std::pair<const char16_t *, char32_t *> simd =
+      arm_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+  if (simd.first == nullptr) {
+    return 0;
+  }
+  // Pointer difference on char32_t*: a count of 32-bit units, not bytes.
+  const size_t simd_units = simd.second - utf32_output;
+  if (simd.first == buf + len) {
+    return simd_units; // the SIMD pass consumed everything
+  }
+  const size_t tail_units =
+      scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+          simd.first, len - (simd.first - buf), simd.second);
+  return tail_units == 0 ? 0 : simd_units + tail_units;
+}
+
+// UTF-16BE -> UTF-32. A vectorized pass (arm_convert_utf16_to_utf32) runs
+// first and the scalar routine finishes whatever input it left unread.
+// Returns the char32_t units written by the SIMD pass plus the scalar
+// routine's return value, or 0 when the SIMD helper yields a null input
+// pointer or the scalar routine returns 0.
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // simd.first: next unread input; simd.second: next free output slot.
+  const std::pair<const char16_t *, char32_t *> simd =
+      arm_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  if (simd.first == nullptr) {
+    return 0;
+  }
+  // Pointer difference on char32_t*: a count of 32-bit units, not bytes.
+  const size_t simd_units = simd.second - utf32_output;
+  if (simd.first == buf + len) {
+    return simd_units; // the SIMD pass consumed everything
+  }
+  const size_t tail_units =
+      scalar::utf16_to_utf32::convert<endianness::BIG>(
+          simd.first, len - (simd.first - buf), simd.second);
+  return tail_units == 0 ? 0 : simd_units + tail_units;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char32_t *> ret =
+      arm_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len,
+                                                                 utf32_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf32_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char32_t *> ret =
+      arm_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len,
+                                                              utf32_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf32_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
+
+// UTF-32 -> Latin-1. A vectorized pass (arm_convert_utf32_to_latin1) runs
+// first and the scalar routine finishes whatever input it left unread.
+// Returns the chars written by the SIMD pass plus the scalar routine's return
+// value, or 0 when the SIMD helper yields a null input pointer or the scalar
+// routine returns 0.
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  // simd.first: next unread input; simd.second: next free output slot.
+  const std::pair<const char32_t *, char *> simd =
+      arm_convert_utf32_to_latin1(buf, len, latin1_output);
+  if (simd.first == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = simd.second - latin1_output;
+  if (simd.first == buf + len) {
+    return simd_written; // the SIMD pass consumed everything
+  }
+
+  const size_t tail_written = scalar::utf32_to_latin1::convert(
+      simd.first, len - (simd.first - buf), simd.second);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<result, char *> ret =
+      arm_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
+        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      latin1_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
+
+// "Valid input" flavour of UTF-32 -> Latin-1. The SIMD pass is the same
+// arm_convert_utf32_to_latin1 used by the validating converter (a null input
+// pointer from it still yields 0); the unread remainder goes through
+// scalar::utf32_to_latin1::convert_valid, whose return value is added without
+// a zero check.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  const std::pair<const char32_t *, char *> simd =
+      arm_convert_utf32_to_latin1(buf, len, latin1_output);
+  if (simd.first == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = simd.second - latin1_output;
+  if (simd.first == buf + len) {
+    return simd_written; // the SIMD pass consumed everything
+  }
+
+  const size_t tail_written = scalar::utf32_to_latin1::convert_valid(
+      simd.first, len - (simd.first - buf), simd.second);
+  return simd_written + tail_written;
+}
+
+// "Valid input" flavour of UTF-32 -> UTF-8. There is no dedicated fast path
+// yet (optimization opportunity), so it reuses the validating
+// convert_utf32_to_utf8.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t written = this->convert_utf32_to_utf8(buf, len, utf8_output);
+  return written;
+}
+
+// UTF-32 -> UTF-16LE. A vectorized pass (arm_convert_utf32_to_utf16) runs
+// first and the scalar routine finishes whatever input it left unread.
+// Returns the char16_t units written by the SIMD pass plus the scalar
+// routine's return value, or 0 when the SIMD helper yields a null input
+// pointer or the scalar routine returns 0.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // simd.first: next unread input; simd.second: next free output slot.
+  const std::pair<const char32_t *, char16_t *> simd =
+      arm_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  if (simd.first == nullptr) {
+    return 0;
+  }
+  // Pointer difference on char16_t*: a count of 16-bit units, not bytes.
+  const size_t simd_units = simd.second - utf16_output;
+  if (simd.first == buf + len) {
+    return simd_units; // the SIMD pass consumed everything
+  }
+  const size_t tail_units =
+      scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+          simd.first, len - (simd.first - buf), simd.second);
+  return tail_units == 0 ? 0 : simd_units + tail_units;
+}
+
+// UTF-32 -> UTF-16BE. A vectorized pass (arm_convert_utf32_to_utf16) runs
+// first and the scalar routine finishes whatever input it left unread.
+// Returns the char16_t units written by the SIMD pass plus the scalar
+// routine's return value, or 0 when the SIMD helper yields a null input
+// pointer or the scalar routine returns 0.
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // simd.first: next unread input; simd.second: next free output slot.
+  const std::pair<const char32_t *, char16_t *> simd =
+      arm_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  if (simd.first == nullptr) {
+    return 0;
+  }
+  // Pointer difference on char16_t*: a count of 16-bit units, not bytes.
+  const size_t simd_units = simd.second - utf16_output;
+  if (simd.first == buf + len) {
+    return simd_units; // the SIMD pass consumed everything
+  }
+  const size_t tail_units =
+      scalar::utf32_to_utf16::convert<endianness::BIG>(
+          simd.first, len - (simd.first - buf), simd.second);
+  return tail_units == 0 ? 0 : simd_units + tail_units;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char16_t *> ret =
+      arm_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len,
+                                                                 utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res =
+        scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf16_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char16_t *> ret =
+      arm_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
+                                                              utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res =
+        scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf16_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
+
+// "Valid input" flavour of UTF-32 -> UTF-16LE; reuses the validating
+// convert_utf32_to_utf16le and returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const size_t units = this->convert_utf32_to_utf16le(buf, len, utf16_output);
+  return units;
+}
+
+// "Valid input" flavour of UTF-32 -> UTF-16BE; reuses the validating
+// convert_utf32_to_utf16be and returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  const size_t units = this->convert_utf32_to_utf16be(buf, len, utf16_output);
+  return units;
+}
+
+// "Valid input" flavour of UTF-16LE -> UTF-32; reuses the validating
+// convert_utf16le_to_utf32 and returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const size_t units = this->convert_utf16le_to_utf32(buf, len, utf32_output);
+  return units;
+}
+
+// "Valid input" flavour of UTF-16BE -> UTF-32; reuses the validating
+// convert_utf16be_to_utf32 and returns its result unchanged.
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  const size_t units = this->convert_utf16be_to_utf32(buf, len, utf32_output);
+  return units;
+}
+
+// Forwards to utf16::change_endianness_utf16, which reads `length` units
+// from `input` and writes to `output`.
+void implementation::change_endianness_utf16(
+    const char16_t *input, size_t length, char16_t *output) const noexcept {
+  utf16::change_endianness_utf16(input, length, output);
+}
+
+// Returns utf16::count_code_points<endianness::LITTLE> for the given buffer.
+simdutf_warn_unused size_t implementation::count_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t code_points =
+      utf16::count_code_points<endianness::LITTLE>(input, length);
+  return code_points;
+}
+
+// Returns utf16::count_code_points<endianness::BIG> for the given buffer.
+simdutf_warn_unused size_t implementation::count_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t code_points =
+      utf16::count_code_points<endianness::BIG>(input, length);
+  return code_points;
+}
+
+// Returns utf8::count_code_points for the given buffer.
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept {
+  const size_t code_points = utf8::count_code_points(input, length);
+  return code_points;
+}
+
+// Reports count_utf8(buf, len) as the Latin-1 length of the UTF-8 input.
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+    const char *buf, size_t len) const noexcept {
+  const size_t code_points = this->count_utf8(buf, len);
+  return code_points;
+}
+
+// Forwards to scalar::utf16::latin1_length_from_utf16; only the length is
+// needed, no buffer is read.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t length) const noexcept {
+  const size_t latin1_len = scalar::utf16::latin1_length_from_utf16(length);
+  return latin1_len;
+}
+
+// Forwards to scalar::utf32::latin1_length_from_utf32; only the length is
+// needed, no buffer is read.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t length) const noexcept {
+  const size_t latin1_len = scalar::utf32::latin1_length_from_utf32(length);
+  return latin1_len;
+}
+
+// Computes the UTF-8 length of a Latin-1 buffer: every input byte counts
+// once, and every byte >= 0x80 counts one extra time. Whole 16-byte blocks
+// are handled with NEON; the trailing `length % 16` bytes are passed to
+// scalar::latin1::utf8_length_from_latin1.
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+    const char *input, size_t length) const noexcept {
+  // See
+  // https://lemire.me/blog/2023/05/15/computing-the-utf-8-size-of-a-latin-1-string-quickly-arm-neon-edition/
+  // credit to Pete Cawley
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
+  uint64_t result = 0;
+  const int lanes = sizeof(uint8x16_t);
+  // Always smaller than `lanes`, so it fits in a uint8_t.
+  uint8_t rem = length % lanes;
+  // One past the last byte that belongs to a complete 16-byte block.
+  const uint8_t *simd_end = data + (length / lanes) * lanes;
+  const uint8x16_t threshold = vdupq_n_u8(0x80);
+  for (; data < simd_end; data += lanes) {
+    // load 16 bytes
+    uint8x16_t input_vec = vld1q_u8(data);
+    // compare to threshold (0x80)
+    uint8x16_t withhighbit = vcgeq_u8(input_vec, threshold);
+    // Across-lane addition: each matching lane is 0xFF, i.e. -1 as int8, so
+    // the sum is minus the number of matches; subtracting it adds that count.
+    result -= vaddvq_s8(vreinterpretq_s8_u8(withhighbit));
+  }
+  // extra bytes + one byte per SIMD-processed input byte + scalar tail
+  return result + (length / lanes) * lanes +
+         scalar::latin1::utf8_length_from_latin1((const char *)simd_end, rem);
+}
+
+// Returns utf16::utf8_length_from_utf16<endianness::LITTLE> for the buffer.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t utf8_len =
+      utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
+  return utf8_len;
+}
+
+// Returns utf16::utf8_length_from_utf16<endianness::BIG> for the buffer.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t utf8_len =
+      utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+  return utf8_len;
+}
+
+// Forwards to scalar::latin1::utf16_length_from_latin1; only the length is
+// needed, no buffer is read.
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t length) const noexcept {
+  const size_t utf16_len = scalar::latin1::utf16_length_from_latin1(length);
+  return utf16_len;
+}
+
+// Forwards to scalar::latin1::utf32_length_from_latin1; only the length is
+// needed, no buffer is read.
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t length) const noexcept {
+  const size_t utf32_len = scalar::latin1::utf32_length_from_latin1(length);
+  return utf32_len;
+}
+
+// Returns utf16::utf32_length_from_utf16<endianness::LITTLE> for the buffer.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t utf32_len =
+      utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+  return utf32_len;
+}
+
+// Returns utf16::utf32_length_from_utf16<endianness::BIG> for the buffer.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t utf32_len =
+      utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+  return utf32_len;
+}
+
+// Returns utf8::utf16_length_from_utf8 for the given buffer.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  const size_t utf16_len = utf8::utf16_length_from_utf8(input, length);
+  return utf16_len;
+}
+
+// Computes the UTF-8 length of a UTF-32 buffer. Four code points are
+// classified per iteration with NEON; the remaining (< 4) code points go to
+// scalar::utf32::utf8_length_from_utf32. Each block starts from 4 bytes per
+// code point (16) and subtracts 3, 2 or 1 for every value that fits in one,
+// two or three UTF-8 bytes respectively.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept {
+  const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f);
+  const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff);
+  const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
+  const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
+  size_t pos = 0;
+  size_t count = 0;
+  for (; pos + 4 <= length; pos += 4) {
+    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
+    // Lanes are all-ones where the comparison holds.
+    const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f);
+    const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff);
+    // (<= 0x7ff) xor (<= 0x7f): values in (0x7f, 0x7ff].
+    const uint32x4_t two_bytes_bytemask =
+        veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask);
+    // (<= 0xffff) xor (<= 0x7ff): values in (0x7ff, 0xffff].
+    const uint32x4_t three_bytes_bytemask =
+        veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask);
+
+    // Reduce every mask lane to a single 0/1 flag.
+    const uint16x8_t reduced_ascii_bytes_bytemask =
+        vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1));
+    const uint16x8_t reduced_two_bytes_bytemask =
+        vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1));
+    const uint16x8_t reduced_three_bytes_bytemask =
+        vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1));
+
+    // Pairwise adds pack the four flags of each mask into one 64-bit half:
+    // compressed_bytemask0 = [ascii flags | two-byte flags],
+    // compressed_bytemask1 = [three-byte flags | three-byte flags].
+    const uint16x8_t compressed_bytemask0 =
+        vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask);
+    const uint16x8_t compressed_bytemask1 =
+        vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask);
+
+    // count_ones is defined elsewhere; presumably a population count, which
+    // here yields the number of flagged lanes in each 64-bit half.
+    size_t ascii_count = count_ones(
+        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0));
+    size_t two_bytes_count = count_ones(
+        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1));
+    size_t three_bytes_count = count_ones(
+        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0));
+
+    count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
+  }
+  return count +
+         scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+}
+
+// Computes the UTF-16 length of a UTF-32 buffer. Four code points are
+// processed per iteration with NEON: each contributes one unit, plus one more
+// when its value exceeds 0xffff. The remaining (< 4) code points go to
+// scalar::utf32::utf16_length_from_utf32.
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept {
+  const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
+  const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
+  size_t pos = 0;
+  size_t count = 0;
+  for (; pos + 4 <= length; pos += 4) {
+    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
+    // All-ones lanes where the value is above 0xffff.
+    const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff);
+    // Reduce every mask lane to a single 0/1 flag.
+    const uint16x8_t reduced_bytemask =
+        vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1));
+    // Pairwise add packs the four flags into the low 64 bits.
+    const uint16x8_t compressed_bytemask =
+        vpaddq_u16(reduced_bytemask, reduced_bytemask);
+    // count_ones is defined elsewhere; presumably a population count.
+    size_t surrogate_count = count_ones(
+        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0));
+    count += 4 + surrogate_count;
+  }
+  return count +
+         scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
+}
+
+// Reports utf8::count_code_points(input, length) as the UTF-32 length.
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  const size_t code_points = utf8::count_code_points(input, length);
+  return code_points;
+}
+
+// char overload: forwards to
+// scalar::base64::maximal_binary_length_from_base64.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char *input, size_t length) const noexcept {
+  const size_t max_len =
+      scalar::base64::maximal_binary_length_from_base64(input, length);
+  return max_len;
+}
+
+// Decodes base64 from a char buffer. The base64_url bit of `options` selects
+// which compress_decode_base64 instantiation is used; every argument is
+// passed through unchanged.
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Decodes base64 from a char buffer and returns the full_result. The
+// base64_url bit of `options` selects which compress_decode_base64
+// instantiation is used; every argument is passed through unchanged.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// char16_t overload: forwards to
+// scalar::base64::maximal_binary_length_from_base64.
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char16_t *input, size_t length) const noexcept {
+  const size_t max_len =
+      scalar::base64::maximal_binary_length_from_base64(input, length);
+  return max_len;
+}
+
+// Decodes base64 from a char16_t buffer. The base64_url bit of `options`
+// selects which compress_decode_base64 instantiation is used; every argument
+// is passed through unchanged.
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Decodes base64 from a char16_t buffer and returns the full_result. The
+// base64_url bit of `options` selects which compress_decode_base64
+// instantiation is used; every argument is passed through unchanged.
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  if (options & base64_url) {
+    return compress_decode_base64<true>(output, input, length, options,
+                                        last_chunk_options);
+  }
+  return compress_decode_base64<false>(output, input, length, options,
+                                       last_chunk_options);
+}
+
+// Forwards to scalar::base64::base64_length_from_binary with the same length
+// and options.
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+    size_t length, base64_options options) const noexcept {
+  const size_t encoded_len =
+      scalar::base64::base64_length_from_binary(length, options);
+  return encoded_len;
+}
+
+// Encodes `length` bytes from `input` via encode_base64 and returns its
+// result. Note the helper takes the output pointer first.
+size_t implementation::binary_to_base64(const char *input, size_t length,
+                                        char *output,
+                                        base64_options options) const noexcept {
+  const size_t written = encode_base64(output, input, length, options);
+  return written;
+}
+
+} // namespace arm64
+} // namespace simdutf
+
+/* begin file src/simdutf/arm64/end.h */
+/* end file src/simdutf/arm64/end.h */
+/* end file src/arm64/implementation.cpp */
+#endif
+#if SIMDUTF_IMPLEMENTATION_FALLBACK
+/* begin file src/fallback/implementation.cpp */
+/* begin file src/simdutf/fallback/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "fallback"
+// #define SIMDUTF_IMPLEMENTATION fallback
+/* end file src/simdutf/fallback/begin.h */
+
+
+
+
+
+
+
+
+#include <cstdint>
+#include <cstring>
+
+namespace simdutf {
+namespace fallback {
+
+// Returns the encodings the input could be, as OR-ed encoding_type flags. A
+// byte-order mark recognized by BOM::check_bom decides the answer outright.
+// Otherwise each validator that accepts the input contributes its flag;
+// UTF-16LE and UTF-32LE are only tried when the byte length is a multiple of
+// 2 and 4 respectively.
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+                                 size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  const auto from_bom = simdutf::BOM::check_bom(input, length);
+  if (from_bom != encoding_type::unspecified) {
+    return from_bom;
+  }
+  // todo: reimplement as a one-pass algorithm.
+  int candidates = 0;
+  if (this->validate_utf8(input, length)) {
+    candidates |= encoding_type::UTF8;
+  }
+  const bool fits_utf16 = (length % 2) == 0;
+  if (fits_utf16 &&
+      this->validate_utf16le(reinterpret_cast<const char16_t *>(input),
+                             length / 2)) {
+    candidates |= encoding_type::UTF16_LE;
+  }
+  const bool fits_utf32 = (length % 4) == 0;
+  if (fits_utf32 &&
+      this->validate_utf32(reinterpret_cast<const char32_t *>(input),
+                           length / 4)) {
+    candidates |= encoding_type::UTF32_LE;
+  }
+  return candidates;
+}
+
+// Fallback UTF-8 validation: forwards to scalar::utf8::validate.
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  const bool is_valid = scalar::utf8::validate(buf, len);
+  return is_valid;
+}
+
+// Fallback UTF-8 validation returning a `result`: forwards to
+// scalar::utf8::validate_with_errors.
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+    const char *buf, size_t len) const noexcept {
+  const result outcome = scalar::utf8::validate_with_errors(buf, len);
+  return outcome;
+}
+
+// Fallback ASCII validation: forwards to scalar::ascii::validate.
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  const bool is_valid = scalar::ascii::validate(buf, len);
+  return is_valid;
+}
+
+// Fallback ASCII validation returning a `result`: forwards to
+// scalar::ascii::validate_with_errors.
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+    const char *buf, size_t len) const noexcept {
+  const result outcome = scalar::ascii::validate_with_errors(buf, len);
+  return outcome;
+}
+
+// Fallback UTF-16LE validation: forwards to
+// scalar::utf16::validate<endianness::LITTLE>.
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *buf,
+                                 size_t len) const noexcept {
+  const bool is_valid = scalar::utf16::validate<endianness::LITTLE>(buf, len);
+  return is_valid;
+}
+
+// Fallback UTF-16BE validation: forwards to
+// scalar::utf16::validate<endianness::BIG>.
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *buf,
+                                 size_t len) const noexcept {
+  const bool is_valid = scalar::utf16::validate<endianness::BIG>(buf, len);
+  return is_valid;
+}
+
+// Fallback UTF-16LE validation returning a `result`: forwards to
+// scalar::utf16::validate_with_errors<endianness::LITTLE>.
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+    const char16_t *buf, size_t len) const noexcept {
+  const result outcome =
+      scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
+  return outcome;
+}
+
+// Fallback UTF-16BE validation returning a `result`: forwards to
+// scalar::utf16::validate_with_errors<endianness::BIG>.
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+    const char16_t *buf, size_t len) const noexcept {
+  const result outcome =
+      scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
+  return outcome;
+}
+
+// Fallback UTF-32 validation: forwards to scalar::utf32::validate.
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  const bool is_valid = scalar::utf32::validate(buf, len);
+  return is_valid;
+}
+
+// Fallback UTF-32 validation returning a `result`: forwards to
+// scalar::utf32::validate_with_errors.
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+    const char32_t *buf, size_t len) const noexcept {
+  const result outcome = scalar::utf32::validate_with_errors(buf, len);
+  return outcome;
+}
+
+// Fallback Latin-1 -> UTF-8: forwards to scalar::latin1_to_utf8::convert.
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
+    const char *buf, size_t len, char *utf8_output) const noexcept {
+  const size_t written =
+      scalar::latin1_to_utf8::convert(buf, len, utf8_output);
+  return written;
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len,
+                                                              utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len,
+                                                           utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return scalar::latin1_to_utf32::convert(buf, len, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::utf8_to_utf16::convert<endianness::LITTLE>(buf, len,
+                                                            utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::utf8_to_utf16::convert<endianness::BIG>(buf, len,
+                                                         utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(
+      buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(
+      buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len,
+                                                                  utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(buf, len,
+                                                               utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return scalar::utf8_to_utf32::convert(buf, len, utf32_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+    const char *input, size_t size, char32_t *utf32_output) const noexcept {
+  return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len,
+                                                              latin1_output);
+}
 
-// 1 byte for length, 16 bytes for mask
-const uint8_t pack_1_2_3_utf8_bytes[256][17] = {
-    {12, 2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80},
-    {9, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {11, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {10, 0, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {9, 2, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {8, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {7, 0, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {11, 2, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {8, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {10, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {9, 0, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {10, 2, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {7, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {9, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {8, 0, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {9, 2, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {6, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {8, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {7, 0, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 2, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {3, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {5, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {4, 0, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {8, 2, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 0, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 2, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {4, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {6, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {5, 0, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {11, 2, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {8, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {10, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {9, 0, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {8, 2, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 0, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {10, 2, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {7, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {9, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {8, 0, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {9, 2, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {8, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {7, 0, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {10, 2, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {7, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {9, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {8, 0, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {7, 2, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {6, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {5, 0, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {9, 2, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {8, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {7, 0, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {8, 2, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 0, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {9, 2, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {6, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {8, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {7, 0, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 2, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {3, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {5, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {4, 0, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {8, 2, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 0, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 2, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {4, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {6, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {5, 0, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {6, 2, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {3, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {5, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {4, 0, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {3, 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80, 0x80},
-    {2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80, 0x80},
-    {5, 2, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {2, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {4, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {3, 0, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {4, 2, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80, 0x80},
-    {3, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {2, 0, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {8, 2, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 0, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 2, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {2, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {4, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {3, 0, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {7, 2, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {4, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {6, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 0, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {6, 2, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {3, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {5, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {4, 0, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 2, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {4, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {6, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 0, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {4, 2, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80, 0x80},
-    {3, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {2, 0, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {6, 2, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {3, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {5, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {4, 0, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {5, 2, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {2, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {4, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {3, 0, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {11, 2, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {8, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {10, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {9, 0, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {8, 2, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 0, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {10, 2, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {7, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {9, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {8, 0, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {9, 2, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {8, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {7, 0, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {8, 2, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 0, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {5, 2, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {2, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80, 0x80},
-    {4, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {3, 0, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {7, 2, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {4, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {6, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {5, 0, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {6, 2, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {3, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {5, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {4, 0, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {10, 2, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {7, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {9, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {8, 0, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {7, 2, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {6, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {5, 0, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {9, 2, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {8, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {7, 0, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {8, 2, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 0, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {9, 2, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {6, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {8, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {7, 0, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 2, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {3, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {5, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {4, 0, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {8, 2, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 0, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 2, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {4, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {6, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {5, 0, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {10, 2, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {7, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {9, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {8, 0, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {7, 2, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {6, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {5, 0, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {9, 2, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {8, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {7, 0, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {8, 2, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 0, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 2, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {4, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {6, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 0, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {4, 2, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80, 0x80},
-    {3, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {2, 0, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {6, 2, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {3, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {5, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {4, 0, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {5, 2, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {2, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {4, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {3, 0, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {9, 2, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80},
-    {6, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {8, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {7, 0, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 2, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {3, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {5, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {4, 0, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {8, 2, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 0, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 2, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {4, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {6, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {5, 0, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {8, 2, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {7, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {6, 0, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 2, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {2, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {4, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {3, 0, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {7, 2, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {4, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {6, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {5, 0, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {6, 2, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80},
-    {3, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80, 0x80},
-    {5, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80},
-    {4, 0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-     0x80, 0x80}};
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len,
+                                                           latin1_output);
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
+      buf, len, latin1_output);
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
+      buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(
+      buf, len, latin1_output);
+}
 
-} // namespace utf16_to_utf8
-} // namespace tables
-} // unnamed namespace
-} // namespace simdutf
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len,
+                                                                 latin1_output);
+}
 
-#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H
-/* end file src/tables/utf16_to_utf8_tables.h */
-// End of tables.
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len,
+                                                            utf8_output);
+}
 
-// The scalar routines should be included once.
-/* begin file src/scalar/ascii.h */
-#ifndef SIMDUTF_ASCII_H
-#define SIMDUTF_ASCII_H
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
+}
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace ascii {
-#if SIMDUTF_IMPLEMENTATION_FALLBACK
-// Only used by the fallback kernel.
-inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  uint64_t pos = 0;
-  // process in blocks of 16 bytes when possible
-  for (; pos + 16 <= len; pos += 16) {
-    uint64_t v1;
-    std::memcpy(&v1, data + pos, sizeof(uint64_t));
-    uint64_t v2;
-    std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-    uint64_t v{v1 | v2};
-    if ((v & 0x8080808080808080) != 0) {
-      return false;
-    }
-  }
-  // process the tail byte-by-byte
-  for (; pos < len; pos++) {
-    if (data[pos] >= 0b10000000) {
-      return false;
-    }
-  }
-  return true;
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept { // fallback kernel: returns the scalar routine's result (little-endian)
+  return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+      buf, len, utf8_output);
 }
-#endif
 
-inline simdutf_warn_unused result validate_with_errors(const char *buf,
-                                                       size_t len) noexcept {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  // process in blocks of 16 bytes when possible
-  for (; pos + 16 <= len; pos += 16) {
-    uint64_t v1;
-    std::memcpy(&v1, data + pos, sizeof(uint64_t));
-    uint64_t v2;
-    std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-    uint64_t v{v1 | v2};
-    if ((v & 0x8080808080808080) != 0) {
-      for (; pos < len; pos++) {
-        if (data[pos] >= 0b10000000) {
-          return result(error_code::TOO_LARGE, pos);
-        }
-      }
-    }
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept { // fallback kernel: returns the scalar routine's result (big-endian)
+  return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+      buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept { // fallback kernel: forwards to scalar convert_valid (little-endian)
+  return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len,
+                                                                  utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept { // fallback kernel: forwards to scalar convert_valid (big-endian)
+  return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len,
+                                                               utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept { // fallback kernel: forwards to scalar::utf32_to_latin1::convert
+  return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept { // fallback kernel: returns the scalar routine's result unchanged
+  return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept { // fallback kernel: forwards to scalar::utf32_to_latin1::convert_valid
+  return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept { // fallback kernel: forwards to scalar::utf32_to_utf8::convert
+  return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept { // fallback kernel: returns the scalar routine's result unchanged
+  return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept { // fallback kernel: forwards to scalar::utf32_to_utf8::convert_valid
+  return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { // fallback kernel: forwards to scalar::utf32_to_utf16 (little-endian output)
+  return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len,
+                                                             utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { // fallback kernel: forwards to scalar::utf32_to_utf16 (big-endian output)
+  return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len,
+                                                          utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { // fallback kernel: returns the scalar routine's result (little-endian output)
+  return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+      buf, len, utf16_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { // fallback kernel: returns the scalar routine's result (big-endian output)
+  return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+      buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { // fallback kernel: forwards to scalar convert_valid (little-endian output)
+  return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(
+      buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { // fallback kernel: forwards to scalar convert_valid (big-endian output)
+  return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len,
+                                                                utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { // fallback kernel: forwards to scalar::utf16_to_utf32 (little-endian input)
+  return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len,
+                                                             utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { // fallback kernel: forwards to scalar::utf16_to_utf32 (big-endian input)
+  return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len,
+                                                          utf32_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { // fallback kernel: returns the scalar routine's result (little-endian input)
+  return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+      buf, len, utf32_output);
+}
+
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { // fallback kernel: returns the scalar routine's result (big-endian input)
+  return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+      buf, len, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { // fallback kernel: forwards to scalar convert_valid (little-endian input)
+  return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(
+      buf, len, utf32_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { // fallback kernel: forwards to scalar convert_valid (big-endian input)
+  return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len,
+                                                                utf32_output);
+}
+
+void implementation::change_endianness_utf16(const char16_t *input,
+                                             size_t length,
+                                             char16_t *output) const noexcept { // fallback kernel: forwards to scalar::utf16::change_endianness_utf16
+  scalar::utf16::change_endianness_utf16(input, length, output);
+}
+
+simdutf_warn_unused size_t implementation::count_utf16le(
+    const char16_t *input, size_t length) const noexcept { // fallback kernel: scalar code-point count (little-endian)
+  return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::count_utf16be(
+    const char16_t *input, size_t length) const noexcept { // fallback kernel: scalar code-point count (big-endian)
+  return scalar::utf16::count_code_points<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept { // fallback kernel: scalar code-point count
+  return scalar::utf8::count_code_points(input, length);
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+    const char *buf, size_t len) const noexcept { // fallback kernel: the Latin-1 length is the UTF-8 code-point count
+  return scalar::utf8::count_code_points(buf, len);
+}
+
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t length) const noexcept { // fallback kernel: forwards to scalar::utf16::latin1_length_from_utf16
+  return scalar::utf16::latin1_length_from_utf16(length);
+}
+
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t length) const noexcept { // fallback kernel: the answer is the input length, unchanged
+  return length;
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+    const char *input, size_t length) const noexcept { // fallback kernel: returns length + number of input bytes >= 0x80
+  size_t answer = length; // one per input byte; one more is added for each byte with bit 7 set
+  size_t i = 0;
+  auto pop = [](uint64_t v) { // how many of the 8 bytes of v have their top bit set
+    return (size_t)(((v >> 7) & UINT64_C(0x0101010101010101)) * // bit 7 of every byte moved to bit 0 and isolated
+                        UINT64_C(0x0101010101010101) >> // the multiply sums the 8 one-bit flags into the top byte
+                    56); // keep only that top byte: a count in 0..8
+  };
+  for (; i + 32 <= length; i += 32) { // 32 bytes per iteration, as four 8-byte words
+    uint64_t v;
+    memcpy(&v, input + i, 8); // memcpy: 8-byte load with no alignment requirement
+    answer += pop(v);
+    memcpy(&v, input + i + 8, sizeof(v));
+    answer += pop(v);
+    memcpy(&v, input + i + 16, sizeof(v));
+    answer += pop(v);
+    memcpy(&v, input + i + 24, sizeof(v));
+    answer += pop(v);
   }
-  // process the tail byte-by-byte
-  for (; pos < len; pos++) {
-    if (data[pos] >= 0b10000000) {
-      return result(error_code::TOO_LARGE, pos);
-    }
+  for (; i + 8 <= length; i += 8) { // remaining whole 8-byte words
+    uint64_t v;
+    memcpy(&v, input + i, sizeof(v));
+    answer += pop(v);
   }
-  return result(error_code::SUCCESS, pos);
+  for (; i + 1 <= length; i += 1) { // last 0..7 bytes, one at a time
+    answer += static_cast<uint8_t>(input[i]) >> 7; // adds 1 exactly when the byte is >= 0x80
+  }
+  return answer;
 }
 
-} // namespace ascii
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept { // fallback kernel: forwards to scalar::utf16::utf8_length_from_utf16 (little-endian)
+  return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input,
+                                                                   length);
+}
 
-#endif
-/* end file src/scalar/ascii.h */
-/* begin file src/scalar/latin1.h */
-#ifndef SIMDUTF_LATIN1_H
-#define SIMDUTF_LATIN1_H
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept { // fallback kernel: forwards to scalar::utf16::utf8_length_from_utf16 (big-endian)
+  return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
+}
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace latin1 {
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept { // fallback kernel: forwards to scalar::utf16::utf32_length_from_utf16 (little-endian)
+  return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input,
+                                                                    length);
+}
 
-inline size_t utf32_length_from_latin1(size_t len) {
-  // We are not BOM aware.
-  return len; // a utf32 unit will always represent 1 latin1 character
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept { // fallback kernel: forwards to scalar::utf16::utf32_length_from_utf16 (big-endian)
+  return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
 }
 
-inline size_t utf8_length_from_latin1(const char *buf, size_t len) {
-  const uint8_t *c = reinterpret_cast<const uint8_t *>(buf);
-  size_t answer = 0;
-  for (size_t i = 0; i < len; i++) {
-    if ((c[i] >> 7)) {
-      answer++;
-    }
-  }
-  return answer + len;
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t length) const noexcept { // fallback kernel: forwards to scalar::latin1::utf16_length_from_latin1
+  return scalar::latin1::utf16_length_from_latin1(length);
 }
 
-inline size_t utf16_length_from_latin1(size_t len) { return len; }
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+    const char *input, size_t length) const noexcept { // fallback kernel: forwards to scalar::utf8::utf16_length_from_utf8
+  return scalar::utf8::utf16_length_from_utf8(input, length);
+}
 
-} // namespace latin1
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept { // fallback kernel: forwards to scalar::utf32::utf8_length_from_utf32
+  return scalar::utf32::utf8_length_from_utf32(input, length);
+}
 
-#endif
-/* end file src/scalar/latin1.h */
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept { // fallback kernel: forwards to scalar::utf32::utf16_length_from_utf32
+  return scalar::utf32::utf16_length_from_utf32(input, length);
+}
 
-/* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
-#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H
-#define SIMDUTF_VALID_UTF32_TO_UTF8_H
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t length) const noexcept { // fallback kernel: forwards to scalar::latin1::utf32_length_from_latin1
+  return scalar::latin1::utf32_length_from_latin1(length);
+}
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf32_to_utf8 {
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+    const char *input, size_t length) const noexcept { // fallback kernel: the UTF-32 length is the UTF-8 code-point count
+  return scalar::utf8::count_code_points(input, length);
+}
 
-#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
-// only used by the fallback and POWER kernel
-inline size_t convert_valid(const char32_t *buf, size_t len,
-                            char *utf8_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char *start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 2 ASCII characters
-    if (pos + 2 <=
-        len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-        *utf8_output++ = char(buf[pos]);
-        *utf8_output++ = char(buf[pos + 1]);
-        pos += 2;
-        continue;
-      }
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char *input, size_t length) const noexcept { // fallback kernel (8-bit input): forwards to the scalar::base64 helper
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept { // fallback kernel: trim the tail here, then the scalar tail decoder handles the rest
+  while (length > 0 &&
+         scalar::base64::is_ascii_white_space(input[length - 1])) {
+    length--; // ignore trailing ASCII whitespace
+  }
+  size_t equallocation =
+      length; // index of the first '=' padding character, if any
+  size_t equalsigns = 0; // number of trailing '=' removed below: 0, 1 or 2
+  if (length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
+    length -= 1;
+    equalsigns++;
+    while (length > 0 &&
+           scalar::base64::is_ascii_white_space(input[length - 1])) {
+      length--; // skip whitespace in front of that '='
+    }
+    if (length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
+      equalsigns++;
+      length -= 1;
+    }
+  }
+  if (length == 0) {
+    if (equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation}; // padding with nothing in front of it
+    }
+    return {SUCCESS, 0}; // empty or whitespace-only input
+  }
+  result r = scalar::base64::base64_tail_decode(
+      output, input, length, equalsigns, options, last_chunk_options);
+  if (last_chunk_options != stop_before_partial &&
+      r.error == error_code::SUCCESS && equalsigns > 0) {
+    // with padding, r.count % 3 must be 1 (for "==") or 2 (for "=")
+    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
+      return {INVALID_BASE64_CHARACTER, equallocation};
+    }
+  }
+  return r;
+}
+
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept { // fallback kernel: trim the tail here, then the scalar tail decoder handles the rest
+  while (length > 0 &&
+         scalar::base64::is_ascii_white_space(input[length - 1])) {
+    length--; // ignore trailing ASCII whitespace
+  }
+  size_t equallocation =
+      length; // index of the first '=' padding character, if any
+  size_t equalsigns = 0; // number of trailing '=' removed below: 0, 1 or 2
+  if (length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
+    length -= 1;
+    equalsigns++;
+    while (length > 0 &&
+           scalar::base64::is_ascii_white_space(input[length - 1])) {
+      length--; // skip whitespace in front of that '='
+    }
+    if (length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
+      equalsigns++;
+      length -= 1;
+    }
+  }
+  if (length == 0) {
+    if (equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation, 0}; // padding with nothing in front of it
+    }
+    return {SUCCESS, 0, 0}; // empty or whitespace-only input
+  }
+  full_result r = scalar::base64::base64_tail_decode(
+      output, input, length, equalsigns, options, last_chunk_options);
+  if (last_chunk_options != stop_before_partial &&
+      r.error == error_code::SUCCESS && equalsigns > 0) {
+    // with padding, r.output_count % 3 must be 1 (for "==") or 2 (for "=")
+    if ((r.output_count % 3 == 0) ||
+        ((r.output_count % 3) + 1 + equalsigns != 4)) {
+      return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
+    }
+  }
+  return r;
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char16_t *input, size_t length) const noexcept { // fallback kernel (16-bit input): forwards to the scalar::base64 helper
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept { // fallback kernel, 16-bit input: trim the tail here, then the scalar tail decoder handles the rest
+  while (length > 0 &&
+         scalar::base64::is_ascii_white_space(input[length - 1])) {
+    length--; // ignore trailing ASCII whitespace
+  }
+  size_t equallocation =
+      length; // index of the first '=' padding character, if any
+  size_t equalsigns = 0; // number of trailing '=' removed below: 0, 1 or 2
+  if (length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
+    length -= 1;
+    equalsigns++;
+    while (length > 0 &&
+           scalar::base64::is_ascii_white_space(input[length - 1])) {
+      length--; // skip whitespace in front of that '='
+    }
+    if (length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
+      equalsigns++;
+      length -= 1;
+    }
+  }
+  if (length == 0) {
+    if (equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation}; // padding with nothing in front of it
+    }
+    return {SUCCESS, 0}; // empty or whitespace-only input
+  }
+  result r = scalar::base64::base64_tail_decode(
+      output, input, length, equalsigns, options, last_chunk_options);
+  if (last_chunk_options != stop_before_partial &&
+      r.error == error_code::SUCCESS && equalsigns > 0) {
+    // with padding, r.count % 3 must be 1 (for "==") or 2 (for "=")
+    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
+      return {INVALID_BASE64_CHARACTER, equallocation};
+    }
+  }
+  return r;
+}
+
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept { // fallback kernel, 16-bit input: trim the tail here, then the scalar tail decoder handles the rest
+  while (length > 0 &&
+         scalar::base64::is_ascii_white_space(input[length - 1])) {
+    length--; // ignore trailing ASCII whitespace
+  }
+  size_t equallocation =
+      length; // index of the first '=' padding character, if any
+  size_t equalsigns = 0; // number of trailing '=' removed below: 0, 1 or 2
+  if (length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
+    length -= 1;
+    equalsigns++;
+    while (length > 0 &&
+           scalar::base64::is_ascii_white_space(input[length - 1])) {
+      length--; // skip whitespace in front of that '='
     }
-    uint32_t word = data[pos];
-    if ((word & 0xFFFFFF80) == 0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if ((word & 0xFFFFF800) == 0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word >> 6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if ((word & 0xFFFF0000) == 0) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word >> 12) | 0b11100000);
-      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word >> 18) | 0b11110000);
-      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
+    if (length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
+      equalsigns++;
+      length -= 1;
     }
   }
-  return utf8_output - start;
+  if (length == 0) {
+    if (equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation, 0}; // padding with nothing in front of it
+    }
+    return {SUCCESS, 0, 0}; // empty or whitespace-only input
+  }
+  full_result r = scalar::base64::base64_tail_decode(
+      output, input, length, equalsigns, options, last_chunk_options);
+  if (last_chunk_options != stop_before_partial &&
+      r.error == error_code::SUCCESS && equalsigns > 0) {
+    // with padding, r.output_count % 3 must be 1 (for "==") or 2 (for "=")
+    if ((r.output_count % 3 == 0) ||
+        ((r.output_count % 3) + 1 + equalsigns != 4)) {
+      return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
+    }
+  }
+  return r;
 }
-#endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64
 
-} // namespace utf32_to_utf8
-} // unnamed namespace
-} // namespace scalar
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+    size_t length, base64_options options) const noexcept { // fallback kernel: forwards to scalar::base64::base64_length_from_binary
+  return scalar::base64::base64_length_from_binary(length, options);
+}
+
+size_t implementation::binary_to_base64(const char *input, size_t length,
+                                        char *output,
+                                        base64_options options) const noexcept { // fallback kernel: the scalar tail encoder is given the whole input
+  return scalar::base64::tail_encode_base64(output, input, length, options);
+}
+} // namespace fallback
 } // namespace simdutf
 
+/* begin file src/simdutf/fallback/end.h */
+/* end file src/simdutf/fallback/end.h */
+/* end file src/fallback/implementation.cpp */
 #endif
-/* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */
-/* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
-#ifndef SIMDUTF_UTF32_TO_UTF8_H
-#define SIMDUTF_UTF32_TO_UTF8_H
+#if SIMDUTF_IMPLEMENTATION_ICELAKE
+/* begin file src/icelake/implementation.cpp */
+
 
+/* begin file src/simdutf/icelake/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "icelake"
+// #define SIMDUTF_IMPLEMENTATION icelake
+
+#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
+// nothing needed.
+#else
+SIMDUTF_TARGET_ICELAKE
+#endif
+
+#if SIMDUTF_GCC11ORMORE // workaround for
+                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
+// clang-format off
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
+// clang-format on
+#endif // end of workaround
+/* end file src/simdutf/icelake/begin.h */
 namespace simdutf {
-namespace scalar {
+namespace icelake {
 namespace {
-namespace utf32_to_utf8 {
+#ifndef SIMDUTF_ICELAKE_H
+  #error "icelake.h must be included"
+#endif
+/* begin file src/icelake/icelake_utf8_common.inl.cpp */
+// Common procedures for both validating and non-validating conversions from
+// UTF-8.
+enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL }; // FULL: a whole 64-byte block may be used; TAIL: only 'gap' input bytes are relevant
 
-inline size_t convert(const char32_t *buf, size_t len, char *utf8_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char *start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 2 ASCII characters
-    if (pos + 2 <=
-        len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-        *utf8_output++ = char(buf[pos]);
-        *utf8_output++ = char(buf[pos + 1]);
-        pos += 2;
-        continue;
+using utf8_to_utf16_result = std::pair<const char *, char16_t *>;
+using utf8_to_utf32_result = std::pair<const char *, uint32_t *>;
+
+/*
+    process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8
+    to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes)
+    might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which
+    indicates how many input bytes are relevant.
+
+    Returns true when the result is correct, otherwise it returns false.
+
+    The provided in and out pointers are advanced according to how many input
+    bytes have been processed, upon success.
+*/
+template <block_processing_mode tail, endianness big_endian>
+simdutf_really_inline bool
+process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) {
+  // constants
+  __m512i mask_identity = _mm512_set_epi8(
+      63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46,
+      45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28,
+      27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9,
+      8, 7, 6, 5, 4, 3, 2, 1, 0);
+  __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
+  __m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
+  __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
+  __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(
+      0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf,
+      0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf,
+      0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
+  __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
+  __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
+  __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
+  __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
+  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  // Note that 'tail' is a compile-time constant !
+  __mmask64 b =
+      (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
+  __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in)
+                                         : _mm512_maskz_loadu_epi8(b, in);
+  __mmask64 m1 = (tail == SIMDUTF_FULL)
+                     ? _mm512_cmplt_epu8_mask(input, mask_80808080)
+                     : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
+  if (_ktestc_mask64_u8(m1,
+                        b)) { // NOT(m1) AND b -- if all zeroes, then all ASCII
+                              // alternatively, we could do 'if (m1 == b) { '
+    if (tail == SIMDUTF_FULL) {
+      in += 64; // consumed 64 bytes
+      // we convert a full 64-byte block, writing 128 bytes.
+      __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+      if (big_endian) {
+        input1 = _mm512_shuffle_epi8(input1, byteflip);
       }
-    }
-    uint32_t word = data[pos];
-    if ((word & 0xFFFFFF80) == 0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if ((word & 0xFFFFF800) == 0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word >> 6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if ((word & 0xFFFF0000) == 0) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      if (word >= 0xD800 && word <= 0xDFFF) {
-        return 0;
+      _mm512_storeu_si512(out, input1);
+      out += 32;
+      __m512i input2 =
+          _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
+      if (big_endian) {
+        input2 = _mm512_shuffle_epi8(input2, byteflip);
       }
-      *utf8_output++ = char((word >> 12) | 0b11100000);
-      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
+      _mm512_storeu_si512(out, input2);
+      out += 32;
+      return true; // we are done
     } else {
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      if (word > 0x10FFFF) {
-        return 0;
+      in += gap;
+      if (gap <= 32) {
+        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+        if (big_endian) {
+          input1 = _mm512_shuffle_epi8(input1, byteflip);
+        }
+        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1),
+                                 input1);
+        out += gap;
+      } else {
+        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
+        if (big_endian) {
+          input1 = _mm512_shuffle_epi8(input1, byteflip);
+        }
+        _mm512_storeu_si512(out, input1);
+        out += 32;
+        __m512i input2 =
+            _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
+        if (big_endian) {
+          input2 = _mm512_shuffle_epi8(input2, byteflip);
+        }
+        _mm512_mask_storeu_epi16(
+            out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
+        out += gap - 32;
       }
-      *utf8_output++ = char((word >> 18) | 0b11110000);
-      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
+      return true; // we are done
     }
   }
-  return utf8_output - start;
-}
+  // classify characters further
+  __mmask64 m234 = _mm512_cmp_epu8_mask(
+      mask_c0c0c0c0, input,
+      _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
+  __mmask64 m34 =
+      _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
+                           _MM_CMPINT_LT); // 0xdf < input,  3 or 4 leading byte
 
-inline result convert_with_errors(const char32_t *buf, size_t len,
-                                  char *utf8_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char *start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 2 ASCII characters
-    if (pos + 2 <=
-        len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF80FFFFFF80) == 0) {
-        *utf8_output++ = char(buf[pos]);
-        *utf8_output++ = char(buf[pos + 1]);
-        pos += 2;
-        continue;
-      }
-    }
-    uint32_t word = data[pos];
-    if ((word & 0xFFFFFF80) == 0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if ((word & 0xFFFFF800) == 0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word >> 6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if ((word & 0xFFFF0000) == 0) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      if (word >= 0xD800 && word <= 0xDFFF) {
-        return result(error_code::SURROGATE, pos);
+  __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(
+      m234, input, mask_c2c2c2c2,
+      _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
+                      // Overlong 2-byte sequence
+  if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
+    // Overlong 2-byte sequence
+    return false;
+  }
+  if (_ktestz_mask64_u8(m34, m34) == 0) {
+    // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a
+    // 4-byte sequence!
+    __mmask64 m4 = _mm512_cmp_epu8_mask(
+        input, mask_f0f0f0f0,
+        _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)
+
+    __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL)
+                                   ? _knot_mask64(m1)
+                                   : _kand_mask64(_knot_mask64(m1), b);
+
+    __mmask64 mp1 = _kshiftli_mask64(m234, 1);
+    __mmask64 mp2 = _kshiftli_mask64(m34, 2);
+    // We could do it as follows...
+    // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit
+    // masks a and b and return 1 if all zeroes but GCC generates better code
+    // when we do:
+    if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and
+                   // return 1 if all zeroes
+      // Fast path with 1,2,3 bytes
+      __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
+      __mmask64 m1234 = _kor_mask64(m1, m234);
+      // mismatched continuation bytes:
+      if (tail == SIMDUTF_FULL) {
+        __mmask64 xnormcm1234 = _kxnor_mask64(
+            mc,
+            m1234); // XNOR of mc and m1234 should be all zero if they differ
+        // the presence of a 1 bit indicates that they overlap.
+        // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return
+        // 1 if all zeroes.
+        if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
+          return false;
+        }
+      } else {
+        __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
+        if (mc != bxorm1234) {
+          return false;
+        }
       }
-      *utf8_output++ = char((word >> 12) | 0b11100000);
-      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      if (word > 0x10FFFF) {
-        return result(error_code::TOO_LARGE, pos);
+      // mend: identifying the last bytes of each sequence to be decoded
+      __mmask64 mend = _kshiftri_mask64(m1234, 1);
+      if (tail != SIMDUTF_FULL) {
+        mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
       }
-      *utf8_output++ = char((word >> 18) | 0b11110000);
-      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    }
-  }
-  return result(error_code::SUCCESS, utf8_output - start);
-}
 
-} // namespace utf32_to_utf8
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+      __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
+      __m512i last_and_thirdu16 =
+          _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
 
-#endif
-/* end file src/scalar/utf32_to_utf8/utf32_to_utf8.h */
+      __m512i nonasciitags = _mm512_maskz_mov_epi8(
+          mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
+      __m512i clearedbytes = _mm512_andnot_si512(
+          nonasciitags, input); // high two bits cleared where not ASCII
+      __m512i lastbytes = _mm512_maskz_permutexvar_epi8(
+          0x5555555555555555, last_and_thirdu16,
+          clearedbytes); // the last byte of each character
 
-/* begin file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
-#ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H
-#define SIMDUTF_VALID_UTF32_TO_UTF16_H
+      __mmask64 mask_before_non_ascii = _kshiftri_mask64(
+          mask_not_ascii, 1); // bytes that precede non-ASCII bytes
+      __m512i indexofsecondlastbytes = _mm512_add_epi16(
+          mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
+      __m512i beforeasciibytes =
+          _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
+      __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(
+          0x5555555555555555, indexofsecondlastbytes,
+          beforeasciibytes); // the second last bytes (of two, three byte seq,
+                             // surrogates)
+      secondlastbytes =
+          _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf32_to_utf16 {
+      __m512i indexofthirdlastbytes = _mm512_add_epi16(
+          mask_ffffffff,
+          indexofsecondlastbytes); // indices of the second last bytes
+      __m512i thirdlastbyte =
+          _mm512_maskz_mov_epi8(m34,
+                                clearedbytes); // only those that are the third
+                                               // last byte of a sequence
+      __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(
+          0x5555555555555555, indexofthirdlastbytes,
+          thirdlastbyte); // the third last bytes (of three byte sequences, hi
+                          // surrogate)
+      thirdlastbytes =
+          _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
+      __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes,
+                                               thirdlastbytes, 254);
+      // the elements of Wout excluding the last element if it happens to be a
+      // high surrogate:
 
-template <endianness big_endian>
-inline size_t convert_valid(const char32_t *buf, size_t len,
-                            char16_t *utf16_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char16_t *start{utf16_output};
-  while (pos < len) {
-    uint32_t word = data[pos];
-    if ((word & 0xFFFF0000) == 0) {
-      // will not generate a surrogate pair
-      *utf16_output++ = !match_system(big_endian)
-                            ? char16_t(utf16::swap_bytes(uint16_t(word)))
-                            : char16_t(word);
-      pos++;
-    } else {
-      // will generate a surrogate pair
-      word -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
+      __mmask64 mprocessed =
+          (tail == SIMDUTF_FULL)
+              ? _pdep_u64(0xFFFFFFFF, mend)
+              : _pdep_u64(
+                    0xFFFFFFFF,
+                    _kand_mask64(
+                        mend, b)); // we adjust mend at the end of the output.
+
+      // Encodings out of range...
+      {
+        // the location of 3-byte sequence start bytes in the input
+        __mmask64 m3 = m34 & (b ^ m4);
+        // code units in Wout corresponding to 3-byte sequences.
+        __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
+        __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
+        __mmask32 Msmall800 =
+            _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
+        __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
+        __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
+        __mmask32 M3s =
+            _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
+        if (_kor_mask32(Msmall800, M3s)) {
+          return false;
+        }
       }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-      pos++;
+      int64_t nout = _mm_popcnt_u64(mprocessed);
+      in += 64 - _lzcnt_u64(mprocessed);
+      if (big_endian) {
+        Wout = _mm512_shuffle_epi8(Wout, byteflip);
+      }
+      _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
+      out += nout;
+      return true; // ok
     }
-  }
-  return utf16_output - start;
-}
+    //
+    // We have a 4-byte sequence, this is the general case.
+    // Slow!
+    __mmask64 mp3 = _kshiftli_mask64(m4, 3);
+    __mmask64 mc =
+        _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
+    __mmask64 m1234 = _kor_mask64(m1, m234);
 
-} // namespace utf32_to_utf16
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+    // mend: identifying the last bytes of each sequence to be decoded
+    __mmask64 mend =
+        _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
+    if (tail != SIMDUTF_FULL) {
+      mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
+    }
+    __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
+    __m512i last_and_thirdu16 =
+        _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
 
-#endif
-/* end file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */
-/* begin file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
-#ifndef SIMDUTF_UTF32_TO_UTF16_H
-#define SIMDUTF_UTF32_TO_UTF16_H
+    __m512i nonasciitags = _mm512_maskz_mov_epi8(
+        mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
+    __m512i clearedbytes = _mm512_andnot_si512(
+        nonasciitags, input); // high two bits cleared where not ASCII
+    __m512i lastbytes = _mm512_maskz_permutexvar_epi8(
+        0x5555555555555555, last_and_thirdu16,
+        clearedbytes); // the last byte of each character
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf32_to_utf16 {
+    __mmask64 mask_before_non_ascii = _kshiftri_mask64(
+        mask_not_ascii, 1); // bytes that precede non-ASCII bytes
+    __m512i indexofsecondlastbytes = _mm512_add_epi16(
+        mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
+    __m512i beforeasciibytes =
+        _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
+    __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(
+        0x5555555555555555, indexofsecondlastbytes,
+        beforeasciibytes); // the second last bytes (of two, three byte seq,
+                           // surrogates)
+    secondlastbytes =
+        _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
 
-template <endianness big_endian>
-inline size_t convert(const char32_t *buf, size_t len, char16_t *utf16_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char16_t *start{utf16_output};
-  while (pos < len) {
-    uint32_t word = data[pos];
-    if ((word & 0xFFFF0000) == 0) {
-      if (word >= 0xD800 && word <= 0xDFFF) {
-        return 0;
+    __m512i indexofthirdlastbytes = _mm512_add_epi16(
+        mask_ffffffff,
+        indexofsecondlastbytes); // indices of the second last bytes
+    __m512i thirdlastbyte = _mm512_maskz_mov_epi8(
+        m34,
+        clearedbytes); // only those that are the third last byte of a sequence
+    __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(
+        0x5555555555555555, indexofthirdlastbytes,
+        thirdlastbyte); // the third last bytes (of three byte sequences, hi
+                        // surrogate)
+    thirdlastbytes =
+        _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
+    __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(
+        lastbytes, secondlastbytes, thirdlastbytes, 254);
+    uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
+    __mmask32 Mlo = __mmask32(Mlo_uint64);
+    __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
+    __m512i lo_surr_mask = _mm512_maskz_mov_epi16(
+        Mlo,
+        mask_dc00dc00); // lo surr: 1101110000000000, other:  0000000000000000
+    __m512i shifted4_thirdsecondandlastbytes =
+        _mm512_srli_epi16(thirdsecondandlastbytes,
+                          4); // hi surr: 00000WVUTSRQPNML  vuts = WVUTS - 1
+    __m512i tagged_lo_surrogates = _mm512_or_si512(
+        thirdsecondandlastbytes,
+        lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other:  unchanged
+    __m512i Wout = _mm512_mask_add_epi16(
+        tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
+        mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other:  unchanged
+    // the elements of Wout excluding the last element if it happens to be a
+    // high surrogate:
+    __mmask32 Mout = ~(Mhi & 0x80000000);
+    __mmask64 mprocessed =
+        (tail == SIMDUTF_FULL)
+            ? _pdep_u64(Mout, mend)
+            : _pdep_u64(
+                  Mout,
+                  _kand_mask64(mend,
+                               b)); // we adjust mend at the end of the output.
+
+    // mismatched continuation bytes:
+    if (tail == SIMDUTF_FULL) {
+      __mmask64 xnormcm1234 = _kxnor_mask64(
+          mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
+      // the presence of a 1 bit indicates that they overlap.
+      // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1
+      // if all zeroes.
+      if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
+        return false;
       }
-      // will not generate a surrogate pair
-      *utf16_output++ = !match_system(big_endian)
-                            ? char16_t(utf16::swap_bytes(uint16_t(word)))
-                            : char16_t(word);
     } else {
-      // will generate a surrogate pair
-      if (word > 0x10FFFF) {
-        return 0;
+      __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
+      if (mc != bxorm1234) {
+        return false;
       }
-      word -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
+    }
+    // Encodings out of range...
+    {
+      // the location of 3-byte sequence start bytes in the input
+      __mmask64 m3 = m34 & (b ^ m4);
+      // code units in Wout corresponding to 3-byte sequences.
+      __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
+      __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
+      __mmask32 Msmall800 =
+          _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
+      __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
+      __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
+      __mmask32 M3s =
+          _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
+      __m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
+      __mmask32 M4s =
+          _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
+      if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) {
+        return false;
       }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
     }
-    pos++;
+    in += 64 - _lzcnt_u64(mprocessed);
+    int64_t nout = _mm_popcnt_u64(mprocessed);
+    if (big_endian) {
+      Wout = _mm512_shuffle_epi8(Wout, byteflip);
+    }
+    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
+    out += nout;
+    return true; // ok
   }
-  return utf16_output - start;
-}
-
-template <endianness big_endian>
-inline result convert_with_errors(const char32_t *buf, size_t len,
-                                  char16_t *utf16_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  size_t pos = 0;
-  char16_t *start{utf16_output};
-  while (pos < len) {
-    uint32_t word = data[pos];
-    if ((word & 0xFFFF0000) == 0) {
-      if (word >= 0xD800 && word <= 0xDFFF) {
-        return result(error_code::SURROGATE, pos);
-      }
-      // will not generate a surrogate pair
-      *utf16_output++ = !match_system(big_endian)
-                            ? char16_t(utf16::swap_bytes(uint16_t(word)))
-                            : char16_t(word);
-    } else {
-      // will generate a surrogate pair
-      if (word > 0x10FFFF) {
-        return result(error_code::TOO_LARGE, pos);
-      }
-      word -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
+  // Fast path 2: all ASCII or 2 byte
+  __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL)
+                                        ? _knot_mask64(m234)
+                                        : _kand_mask64(_knot_mask64(m234), b);
+  // on top of -0xc0 we subtract -2 which we get back later of the
+  // continuation byte tags
+  __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
+  __mmask64 leading = tail == (tail == SIMDUTF_FULL)
+                          ? _kor_mask64(m1, m234)
+                          : _kand_mask64(_kor_mask64(m1, m234),
+                                         b); // first bytes of each sequence
+  if (tail == SIMDUTF_FULL) {
+    __mmask64 xnor234leading =
+        _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
+    if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) {
+      return false;
+    }
+  } else {
+    __mmask64 bxorleading = _kxor_mask64(b, leading);
+    if (_kshiftli_mask64(m234, 1) != bxorleading) {
+      return false;
     }
-    pos++;
   }
-  return result(error_code::SUCCESS, utf16_output - start);
-}
+  //
+  if (tail == SIMDUTF_FULL) {
+    // In the two-byte/ASCII scenario, we are easily latency bound, so we want
+    // to increment the input buffer as quickly as possible.
+    // We process 32 bytes unless the byte at index 32 is a continuation byte,
+    // in which case we include it as well for a total of 33 bytes.
+    // Note that if x is an ASCII byte, then the following is false:
+    // int8_t(x) <= int8_t(0xc0) under two's complement.
+    in += 32;
+    if (int8_t(*in) <= int8_t(0xc0))
+      in++;
+    // The alternative is to do
+    // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
+    // but it requires loading the input, doing the mask computation, and
+    // converting back the mask to a general register. It just takes too long,
+    // leaving the processor likely to be idle.
+  } else {
+    in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
+  }
+  __m512i lead = _mm512_maskz_compress_epi8(
+      leading, leading2byte); // will contain zero for ascii, and the data
+  lead = _mm512_cvtepu8_epi16(
+      _mm512_castsi512_si256(lead)); // ... zero extended into code units
+  __m512i follow = _mm512_maskz_compress_epi8(
+      continuation_or_ascii, input); // the last bytes of each sequence
+  follow = _mm512_cvtepu8_epi16(
+      _mm512_castsi512_si256(follow)); // ... zero extended into code units
+  lead = _mm512_slli_epi16(lead, 6);   // shifted into position
+  __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow
 
-} // namespace utf32_to_utf16
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+  if (big_endian) {
+    final = _mm512_shuffle_epi8(final, byteflip);
+  }
+  if (tail == SIMDUTF_FULL) {
+    // Next part is UTF-16 specific and can be generalized to UTF-32.
+    int nout = _mm_popcnt_u32(uint32_t(leading));
+    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
+    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
+  } else {
+    int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
+    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
+    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
+  }
 
-#endif
-/* end file src/scalar/utf32_to_utf16/utf32_to_utf16.h */
+  return true; // we are fine.
+}
 
-/* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
-#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H
-#define SIMDUTF_VALID_UTF16_TO_UTF8_H
+/*
+    utf32_to_utf16_masked converts `count` lower UTF-32 code units
+    from input `utf32` into UTF-16. It differs from utf32_to_utf16
+    in that it 'masks' the writes.
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf16_to_utf8 {
+    Returns how many 16-bit code units were stored.
 
+    byteflip is used for flipping 16-bit code units, and it should be
+        __m512i byteflip = _mm512_setr_epi64(
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
+    We pass it to the (always inlined) function to encourage the compiler to
+    keep the value in a (constant) register.
+*/
 template <endianness big_endian>
-inline size_t convert_valid(const char16_t *buf, size_t len,
-                            char *utf8_output) {
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char *start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 4 ASCII characters
-    if (pos + 4 <=
-        len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if (!match_system(big_endian)) {
-        v = (v >> 8) | (v << (64 - 8));
-      }
-      if ((v & 0xFF80FF80FF80FF80) == 0) {
-        size_t final_pos = pos + 4;
-        while (pos < final_pos) {
-          *utf8_output++ = !match_system(big_endian)
-                               ? char(utf16::swap_bytes(buf[pos]))
-                               : char(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
+simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip,
+                                                   __m512i utf32,
+                                                   unsigned int count,
+                                                   char16_t *output) {
+
+  const __mmask16 valid = uint16_t((1 << count) - 1);
+  // 1. check if any of the valid lanes needs a surrogate pair (> U+FFFF)
+  const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
+  const __mmask16 sp_mask =
+      _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff);
+
+  if (sp_mask == 0) {
+    if (big_endian) {
+      _mm256_mask_storeu_epi16(
+          (__m256i *)output, valid,
+          _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),
+                              _mm512_castsi512_si256(byteflip)));
 
-    uint16_t word =
-        !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if ((word & 0xFF80) == 0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if ((word & 0xF800) == 0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word >> 6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if ((word & 0xF800) != 0xD800) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word >> 12) | 0b11100000);
-      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
     } else {
-      // must be a surrogate pair
-      uint16_t diff = uint16_t(word - 0xD800);
-      if (pos + 1 >= len) {
-        return 0;
-      } // minimal bound checking
-      uint16_t next_word = !match_system(big_endian)
-                               ? utf16::swap_bytes(data[pos + 1])
-                               : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((value >> 18) | 0b11110000);
-      *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((value & 0b111111) | 0b10000000);
-      pos += 2;
+      _mm256_mask_storeu_epi16((__m256i *)output, valid,
+                               _mm512_cvtepi32_epi16(utf32));
     }
+    return count;
   }
-  return utf8_output - start;
-}
 
-} // namespace utf16_to_utf8
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+  {
+    // build surrogate pair code units in 32-bit lanes
 
-#endif
-/* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */
-/* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
-#ifndef SIMDUTF_UTF16_TO_UTF8_H
-#define SIMDUTF_UTF16_TO_UTF8_H
+    //    t0 = 16 x [000000000000aaaa|aaaaaabbbbbbbbbb]
+    const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
+    const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf16_to_utf8 {
+    //    t1 = 16 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
+    const __m512i t1 = _mm512_slli_epi32(t0, 6);
 
-template <endianness big_endian>
-inline size_t convert(const char16_t *buf, size_t len, char *utf8_output) {
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char *start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 8 bytes
-    if (pos + 4 <=
-        len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if (!match_system(big_endian)) {
-        v = (v >> 8) | (v << (64 - 8));
-      }
-      if ((v & 0xFF80FF80FF80FF80) == 0) {
-        size_t final_pos = pos + 4;
-        while (pos < final_pos) {
-          *utf8_output++ = !match_system(big_endian)
-                               ? char(utf16::swap_bytes(buf[pos]))
-                               : char(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint16_t word =
-        !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if ((word & 0xFF80) == 0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if ((word & 0xF800) == 0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word >> 6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if ((word & 0xF800) != 0xD800) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word >> 12) | 0b11100000);
-      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      if (pos + 1 >= len) {
-        return 0;
-      }
-      uint16_t diff = uint16_t(word - 0xD800);
-      if (diff > 0x3FF) {
-        return 0;
-      }
-      uint16_t next_word = !match_system(big_endian)
-                               ? utf16::swap_bytes(data[pos + 1])
-                               : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if (diff2 > 0x3FF) {
-        return 0;
-      }
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((value >> 18) | 0b11110000);
-      *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((value & 0b111111) | 0b10000000);
-      pos += 2;
-    }
-  }
-  return utf8_output - start;
-}
+    //    t2 = 16 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from
+    //    t1 to t0
+    //         0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
+    const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
+    const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);
 
-template <endianness big_endian>
-inline result convert_with_errors(const char16_t *buf, size_t len,
-                                  char *utf8_output) {
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char *start{utf8_output};
-  while (pos < len) {
-    // try to convert the next block of 8 bytes
-    if (pos + 4 <=
-        len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if (!match_system(big_endian))
-        v = (v >> 8) | (v << (64 - 8));
-      if ((v & 0xFF80FF80FF80FF80) == 0) {
-        size_t final_pos = pos + 4;
-        while (pos < final_pos) {
-          *utf8_output++ = !match_system(big_endian)
-                               ? char(utf16::swap_bytes(buf[pos]))
-                               : char(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint16_t word =
-        !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if ((word & 0xFF80) == 0) {
-      // will generate one UTF-8 bytes
-      *utf8_output++ = char(word);
-      pos++;
-    } else if ((word & 0xF800) == 0) {
-      // will generate two UTF-8 bytes
-      // we have 0b110XXXXX 0b10XXXXXX
-      *utf8_output++ = char((word >> 6) | 0b11000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else if ((word & 0xF800) != 0xD800) {
-      // will generate three UTF-8 bytes
-      // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((word >> 12) | 0b11100000);
-      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((word & 0b111111) | 0b10000000);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      if (pos + 1 >= len) {
-        return result(error_code::SURROGATE, pos);
-      }
-      uint16_t diff = uint16_t(word - 0xD800);
-      if (diff > 0x3FF) {
-        return result(error_code::SURROGATE, pos);
-      }
-      uint16_t next_word = !match_system(big_endian)
-                               ? utf16::swap_bytes(data[pos + 1])
-                               : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if (diff2 > 0x3FF) {
-        return result(error_code::SURROGATE, pos);
-      }
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      // will generate four UTF-8 bytes
-      // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-      *utf8_output++ = char((value >> 18) | 0b11110000);
-      *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-      *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-      *utf8_output++ = char((value & 0b111111) | 0b10000000);
-      pos += 2;
+    //    t3 = 16 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- keep the low 10
+    //    bits of each half and tag them as high/low surrogates
+    //         0xba = (t2 and not v_fc00_fc00) or v_d800_dc00
+    const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
+    const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
+    const __m512i t3 =
+        _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
+    const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
+    __m512i t5 = _mm512_ror_epi32(t4, 16);
+    // After the rotation a valid lane without a surrogate pair has a zero
+    // lower 16-bit half, which must be dropped. Odd halves (0xaaaaaaaa) are
+    // always kept; even halves are kept only when nonzero (high surrogate).
+    const __mmask32 nonzero = _kor_mask32(
+        0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
+    const __mmask32 nonzero_masked =
+        _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2 * count)) - 1));
+    if (big_endian) {
+      t5 = _mm512_shuffle_epi8(t5, byteflip);
     }
+    // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
+    // (zen4)
+    __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5);
+    // One mask bit per UTF-16 unit produced; up to 32 bits can be set, so the
+    // shift is done on a 64-bit value (an int shift would overflow).
+    const auto nout = count + static_cast<unsigned int>(count_ones(sp_mask));
+    _mm512_mask_storeu_epi16(output, __mmask32((uint64_t(1) << nout) - 1),
+                             compressed);
   }
-  return result(error_code::SUCCESS, utf8_output - start);
-}
-
-} // namespace utf16_to_utf8
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
 
-#endif
-/* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */
+  return count + static_cast<unsigned int>(count_ones(sp_mask));
+}
 
-/* begin file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
-#ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H
-#define SIMDUTF_VALID_UTF16_TO_UTF32_H
+/*
+    utf32_to_utf16 converts `count` lower UTF-32 code units
+    from input `utf32` into UTF-16. It may overflow (write past `count` units).
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf16_to_utf32 {
+    Returns how many 16-bit code units were stored.
 
+    byteflip is used for flipping 16-bit code units, and it should be
+        __m512i byteflip = _mm512_setr_epi64(
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809,
+            0x0607040502030001,
+            0x0e0f0c0d0a0b0809
+        );
+    We pass it to the (always inlined) function to encourage the compiler to
+    keep the value in a (constant) register.
+*/
 template <endianness big_endian>
-inline size_t convert_valid(const char16_t *buf, size_t len,
-                            char32_t *utf32_output) {
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char32_t *start{utf32_output};
-  while (pos < len) {
-    uint16_t word =
-        !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if ((word & 0xF800) != 0xD800) {
-      // No surrogate pair, extend 16-bit word to 32-bit word
-      *utf32_output++ = char32_t(word);
-      pos++;
+simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip,
+                                            __m512i utf32, unsigned int count,
+                                            char16_t *output) {
+  // check if we have any surrogate pairs
+  const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
+  const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
+
+  if (sp_mask == 0) {
+    // technically, it should be _mm256_storeu_epi16
+    if (big_endian) {
+      _mm256_storeu_si256(
+          (__m256i *)output,
+          _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),
+                              _mm512_castsi512_si256(byteflip)));
     } else {
-      // must be a surrogate pair
-      uint16_t diff = uint16_t(word - 0xD800);
-      if (pos + 1 >= len) {
-        return 0;
-      } // minimal bound checking
-      uint16_t next_word = !match_system(big_endian)
-                               ? utf16::swap_bytes(data[pos + 1])
-                               : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      *utf32_output++ = char32_t(value);
-      pos += 2;
+      _mm256_storeu_si256((__m256i *)output, _mm512_cvtepi32_epi16(utf32));
     }
+    return count;
   }
-  return utf32_output - start;
-}
 
-} // namespace utf16_to_utf32
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+  {
+    // build surrogate pair code units in 32-bit lanes
 
-#endif
-/* end file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */
-/* begin file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
-#ifndef SIMDUTF_UTF16_TO_UTF32_H
-#define SIMDUTF_UTF16_TO_UTF32_H
+    //    t0 = 16 x [000000000000aaaa|aaaaaabbbbbbbbbb]
+    const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
+    const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf16_to_utf32 {
+    //    t1 = 16 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
+    const __m512i t1 = _mm512_slli_epi32(t0, 6);
 
-template <endianness big_endian>
-inline size_t convert(const char16_t *buf, size_t len, char32_t *utf32_output) {
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char32_t *start{utf32_output};
-  while (pos < len) {
-    uint16_t word =
-        !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if ((word & 0xF800) != 0xD800) {
-      // No surrogate pair, extend 16-bit word to 32-bit word
-      *utf32_output++ = char32_t(word);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      uint16_t diff = uint16_t(word - 0xD800);
-      if (diff > 0x3FF) {
-        return 0;
-      }
-      if (pos + 1 >= len) {
-        return 0;
-      } // minimal bound checking
-      uint16_t next_word = !match_system(big_endian)
-                               ? utf16::swap_bytes(data[pos + 1])
-                               : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if (diff2 > 0x3FF) {
-        return 0;
-      }
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      *utf32_output++ = char32_t(value);
-      pos += 2;
+    //    t2 = 16 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1
+    //    to t0
+    //         0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
+    const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
+    const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);
+
+    //    t3 = 16 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- clear the top six
+    //    bits of each 16-bit half, then set the surrogate prefixes
+    //         0xba = (t2 and not v_fc00_fc00) or v_d800_dc00
+    const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
+    const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
+    const __m512i t3 =
+        _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
+    const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
+    __m512i t5 = _mm512_ror_epi32(t4, 16); // swap halves: high surrogate first
+    const __mmask32 nonzero = _kor_mask32(
+        0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
+    if (big_endian) {
+      t5 = _mm512_shuffle_epi8(t5, byteflip);
     }
+    // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
+    // (zen4); the store mask is built with a 64-bit shift because the number
+    // of code units can reach 32, where `1 << n` on int would be undefined
+    __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5);
+    const uint64_t n = count + static_cast<unsigned int>(count_ones(sp_mask));
+    _mm512_mask_storeu_epi16(output, __mmask32((uint64_t(1) << n) - 1),
+                             compressed);
+    //_mm512_mask_compressstoreu_epi16(output, nonzero, t5);
   }
-  return utf32_output - start;
+
+  return count + static_cast<unsigned int>(count_ones(sp_mask));
 }
 
-template <endianness big_endian>
-inline result convert_with_errors(const char16_t *buf, size_t len,
-                                  char32_t *utf32_output) {
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char32_t *start{utf32_output};
-  while (pos < len) {
-    uint16_t word =
-        !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if ((word & 0xF800) != 0xD800) {
-      // No surrogate pair, extend 16-bit word to 32-bit word
-      *utf32_output++ = char32_t(word);
-      pos++;
-    } else {
-      // must be a surrogate pair
-      uint16_t diff = uint16_t(word - 0xD800);
-      if (diff > 0x3FF) {
-        return result(error_code::SURROGATE, pos);
-      }
-      if (pos + 1 >= len) {
-        return result(error_code::SURROGATE, pos);
-      } // minimal bound checking
-      uint16_t next_word = !match_system(big_endian)
-                               ? utf16::swap_bytes(data[pos + 1])
-                               : data[pos + 1];
-      uint16_t diff2 = uint16_t(next_word - 0xDC00);
-      if (diff2 > 0x3FF) {
-        return result(error_code::SURROGATE, pos);
-      }
-      uint32_t value = (diff << 10) + diff2 + 0x10000;
-      *utf32_output++ = char32_t(value);
-      pos += 2;
-    }
+/**
+ * Store the last N bytes of previous followed by 64-N bytes from input.
+ */
+template <int N> __m512i prev(__m512i input, __m512i previous) {
+  static_assert(N <= 32, "N must be no larger than 32"); // NOTE(review): the 16 - N shift below looks valid only for N <= 16 -- confirm
+  const __m512i movemask = // indices 28..31 select the top 128 bits of previous
+      _mm512_setr_epi32(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
+  const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous);
+#if SIMDUTF_GCC8 || SIMDUTF_GCC9
+  constexpr int shift = 16 - N; // workaround for GCC8,9
+  return _mm512_alignr_epi8(input, rotated, shift);
+#else
+  return _mm512_alignr_epi8(input, rotated, 16 - N);
+#endif // SIMDUTF_GCC8 || SIMDUTF_GCC9
+}
+
+template <unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
+__m512i shuffle_epi128(__m512i v) { // reorder the four 128-bit lanes of v
+  static_assert((idx0 >= 0 && idx0 <= 3), "idx0 must be in range 0..3"); // idx* are unsigned: the >= 0 half is always true
+  static_assert((idx1 >= 0 && idx1 <= 3), "idx1 must be in range 0..3");
+  static_assert((idx2 >= 0 && idx2 <= 3), "idx2 must be in range 0..3");
+  static_assert((idx3 >= 0 && idx3 <= 3), "idx3 must be in range 0..3");
+  // Pack the four 2-bit lane selectors into the shuffle immediate.
+  constexpr unsigned shuffle = idx0 | (idx1 << 2) | (idx2 << 4) | (idx3 << 6);
+  return _mm512_shuffle_i32x4(v, v, shuffle);
+}
+
+template <unsigned idx> constexpr __m512i broadcast_epi128(__m512i v) {
+  return shuffle_epi128<idx, idx, idx, idx>(v); // copy 128-bit lane idx of v into all four lanes
+}
+
+/**
+ * Rotate the bytes of input so that byte N becomes byte 0 (0 <= N <= 16). Currently unused.
+ */
+template <int N> __m512i rotate_by_N_epi8(const __m512i input) {
+
+  // lanes order: 1, 2, 3, 0 => 0b00_11_10_01
+  const __m512i permuted = _mm512_shuffle_i32x4(input, input, 0x39);
+
+  return _mm512_alignr_epi8(permuted, input, N);
+}
+
+/*
+    expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`)
+    stored at separate 32-bit lanes.
+
+    For each lane we have also a character class (`char_class`), given in form
+    0x8080800N, where N is 4 highest bits from the leading byte; 0x80 resets
+    corresponding bytes during pshufb.
+*/
+simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class,
+                                                     __m512i utf8) {
+  /*
+      Input:
+      - utf8: bytes stored at separate 32-bit code units
+      - char_class: per-lane character class, as described above
+
+      Bit layout of single word. We show 4 cases for each possible
+      UTF-8 character encoding. The `?` denotes bits we must not
+      assume their value.
+
+      |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char
+      |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char
+      |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char
+      |????.????|????.????|????.????|0aaa.aaaa| ASCII char
+        byte 3    byte 2    byte 1     byte 0
+  */
+
+  /* 1. Reset control bits of continuation bytes and the MSB
+        of the leading byte; this makes all bytes unsigned (and
+        does not alter ASCII char).
+
+      |00dd.dddd|00cc.cccc|00bb.bbbb|0111.0aaa| 4-byte char
+      |00??.????|00cc.cccc|00bb.bbbb|0110.aaaa| 3-byte char
+      |00??.????|00??.????|00bb.bbbb|010a.aaaa| 2-byte char
+      |00??.????|00??.????|00??.????|0aaa.aaaa| ASCII char
+       ^^        ^^        ^^        ^
+  */
+  __m512i values;
+  const __m512i v_3f3f_3f7f = _mm512_set1_epi32(0x3f3f3f7f);
+  values = _mm512_and_si512(utf8, v_3f3f_3f7f);
+
+  /* 2. Swap and join fields A-B and C-D
+
+      |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char
+      |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char
+      |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char
+      |0000.????|????.????|000a.aaaa|aa??.????| ASCII char */
+  const __m512i v_0140_0140 = _mm512_set1_epi32(0x01400140);
+  values = _mm512_maddubs_epi16(values, v_0140_0140);
+
+  /* 3. Swap and join fields AB & CD
+
+      |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char
+      |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char
+      |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char
+      |0000.000a|aaaa.aa??|????.????|????.????| ASCII char */
+  const __m512i v_0001_1000 = _mm512_set1_epi32(0x00011000);
+  values = _mm512_madd_epi16(values, v_0001_1000);
+
+  /* 4. Shift left the values by variable amounts to reset highest UTF-8 bits
+      |aaab.bbbb|bccc.cccd|dddd.d000|0000.0000| 4-byte char -- by 11
+      |aaaa.bbbb|bbcc.cccc|????.??00|0000.0000| 3-byte char -- by 10
+      |aaaa.abbb|bbb?.????|????.???0|0000.0000| 2-byte char -- by 9
+      |aaaa.aaa?|????.????|????.????|?000.0000| ASCII char -- by 7 */
+  {
+    /** pshufb
+
+    continuation = 0
+    ascii    = 7
+    _2_bytes = 9
+    _3_bytes = 10
+    _4_bytes = 11
+
+    shift_left_v3 = 4 * [
+        ascii, # 0000
+        ascii, # 0001
+        ascii, # 0010
+        ascii, # 0011
+        ascii, # 0100
+        ascii, # 0101
+        ascii, # 0110
+        ascii, # 0111
+        continuation, # 1000
+        continuation, # 1001
+        continuation, # 1010
+        continuation, # 1011
+        _2_bytes, # 1100
+        _2_bytes, # 1101
+        _3_bytes, # 1110
+        _4_bytes, # 1111
+    ] */
+    const __m512i shift_left_v3 = _mm512_setr_epi64(
+        0x0707070707070707, 0x0b0a090900000000, 0x0707070707070707,
+        0x0b0a090900000000, 0x0707070707070707, 0x0b0a090900000000,
+        0x0707070707070707, 0x0b0a090900000000);
+
+    const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class);
+    values = _mm512_sllv_epi32(values, shift);
   }
-  return result(error_code::SUCCESS, utf32_output - start);
+
+  /* 5. Shift right the values by variable amounts to reset lowest bits
+      |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char -- by 11
+      |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char -- by 16
+      |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char -- by 21
+      |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char -- by 25 */
+  {
+    // 4 * [25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 21, 21, 16, 11]
+    const __m512i shift_right = _mm512_setr_epi64(
+        0x1919191919191919, 0x0b10151500000000, 0x1919191919191919,
+        0x0b10151500000000, 0x1919191919191919, 0x0b10151500000000,
+        0x1919191919191919, 0x0b10151500000000);
+
+    const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class);
+    values = _mm512_srlv_epi32(values, shift);
+  }
+
+  return values;
 }
 
-} // namespace utf16_to_utf32
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1,
+                                                  int &count) { // count is an output
+  const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1); // dword 12 from lane1, rest from lane0
+  const __m512i expand_ver2 = _mm512_setr_epi64( // one 4-byte window per 32-bit lane
+      0x0403020103020100, 0x0605040305040302, 0x0807060507060504,
+      0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a,
+      0x000f0e0d0f0e0d0c, 0x0201000f01000f0e);
+  const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);
+  const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);
+  const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);
+  const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);
+  const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080); // lowest byte is not 10xx.xxxx
+  count = static_cast<int>(count_ones(leading_bytes)); // number of lanes kept
+  return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes,
+                                    input); // kept lanes packed first, rest zero
+}
 
-#endif
-/* end file src/scalar/utf16_to_utf32/utf16_to_utf32.h */
+simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) { // derive char_class, then convert
+  __m512i char_class = _mm512_srli_epi32(input, 4);
+  /*  char_class = ((input >> 4) & 0x0f) | 0x80808000; 0xea = (a and b) or c */
+  const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);
+  const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);
+  char_class =
+      _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea);
+  return expanded_utf8_to_utf32(char_class, input);
+}
+/* end file src/icelake/icelake_utf8_common.inl.cpp */
+/* begin file src/icelake/icelake_macros.inl.cpp */
 
-/* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
-#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H
-#define SIMDUTF_VALID_UTF8_TO_UTF16_H
+/*
+    This upcoming macro (SIMDUTF_ICELAKE_TRANSCODE16) takes 16 + 4 bytes (of a
+    UTF-8 string) and loads all possible 4-byte substrings into an AVX512
+    register.
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf8_to_utf16 {
+    For example if we have bytes abcdefgh... we create the following 32-bit lanes
 
-template <endianness big_endian>
-inline size_t convert_valid(const char *buf, size_t len,
-                            char16_t *utf16_output) {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char16_t *start{utf16_output};
-  while (pos < len) {
-    // try to convert the next block of 8 ASCII bytes
-    if (pos + 8 <=
-        len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 8;
-        while (pos < final_pos) {
-          *utf16_output++ = !match_system(big_endian)
-                                ? char16_t(utf16::swap_bytes(buf[pos]))
-                                : char16_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf16_output++ = !match_system(big_endian)
-                            ? char16_t(utf16::swap_bytes(leading_byte))
-                            : char16_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if (pos + 1 >= len) {
-        break;
-      } // minimal bound checking
-      uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) |
-                                     (data[pos + 1] & 0b00111111));
-      if (!match_system(big_endian)) {
-        code_point = utf16::swap_bytes(uint16_t(code_point));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if (pos + 2 >= len) {
-        break;
-      } // minimal bound checking
-      uint16_t code_point = uint16_t(((leading_byte & 0b00001111) << 12) |
-                                     ((data[pos + 1] & 0b00111111) << 6) |
-                                     (data[pos + 2] & 0b00111111));
-      if (!match_system(big_endian)) {
-        code_point = utf16::swap_bytes(uint16_t(code_point));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if (pos + 3 >= len) {
-        break;
-      } // minimal bound checking
-      uint32_t code_point = ((leading_byte & 0b00000111) << 18) |
-                            ((data[pos + 1] & 0b00111111) << 12) |
-                            ((data[pos + 2] & 0b00111111) << 6) |
-                            (data[pos + 3] & 0b00111111);
-      code_point -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-      pos += 4;
-    } else {
-      // we may have a continuation but we do not do error checking
-      return 0;
-    }
+    [abcd|bcde|cdef|defg|efgh|...]
+     ^                          ^
+     byte 0 of reg              byte 63 of reg
+*/
+/** pshufb
+        # lane{0,1,2} have got bytes: [0, 1, 2, ..., 15]
+        # lane3 has got bytes:        [16, 17, 18, 19, 4, 5, 6, ..., 15]
+        # (only the first dword of lane3 is replaced by the next 16-byte block)
+
+        expand_ver2 = [
+            # lane 0:
+            0, 1, 2, 3,
+            1, 2, 3, 4,
+            2, 3, 4, 5,
+            3, 4, 5, 6,
+
+            # lane 1:
+            4, 5, 6, 7,
+            5, 6, 7, 8,
+            6, 7, 8, 9,
+            7, 8, 9, 10,
+
+            # lane 2:
+             8,  9, 10, 11,
+             9, 10, 11, 12,
+            10, 11, 12, 13,
+            11, 12, 13, 14,
+
+            # lane 3 (logical byte order 12..15, 13..16, 14..17, 15..18):
+            12, 13, 14, 15, 13, 14, 15,  0, 14, 15,  0,  1, 15,  0,  1,  2,
+        ]
+*/
+
+#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED)                      \
+  {                                                                            \
+    const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1);        \
+    const __m512i expand_ver2 = _mm512_setr_epi64(                             \
+        0x0403020103020100, 0x0605040305040302, 0x0807060507060504,            \
+        0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a,            \
+        0x000f0e0d0f0e0d0c, 0x0201000f01000f0e);                               \
+    const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);            \
+    /* leading_bytes: lanes whose lowest byte is not a continuation byte */    \
+    __mmask16 leading_bytes;                                                   \
+    const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);                       \
+    const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);                   \
+    const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);                       \
+    leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);                 \
+                                                                               \
+    __m512i char_class;                                                        \
+    char_class = _mm512_srli_epi32(input, 4);                                  \
+    /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */                     \
+    const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);                       \
+    const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);                 \
+    char_class =                                                               \
+        _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); \
+                                                                               \
+    const int valid_count = static_cast<int>(count_ones(leading_bytes));       \
+    const __m512i utf32 = expanded_utf8_to_utf32(char_class, input);           \
+    /* pack the lanes that start a character to the front */                   \
+    const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(),     \
+                                                   leading_bytes, utf32);      \
+    /* uses UTF32, output, big_endian, byteflip from the expansion site */     \
+    if (UTF32) {                                                               \
+      if (MASKED) {                                                            \
+        const __mmask16 valid = uint16_t((1 << valid_count) - 1);              \
+        _mm512_mask_storeu_epi32((__m512i *)output, valid, out);               \
+      } else {                                                                 \
+        _mm512_storeu_si512((__m512i *)output, out);                           \
+      }                                                                        \
+      output += valid_count;                                                   \
+    } else {                                                                   \
+      if (MASKED) {                                                            \
+        output += utf32_to_utf16_masked<big_endian>(                           \
+            byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
+      } else {                                                                 \
+        output += utf32_to_utf16<big_endian>(                                  \
+            byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
+      }                                                                        \
+    }                                                                          \
   }
-  return utf16_output - start;
-}
 
-} // namespace utf8_to_utf16
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED)       \
+  { /* uses UTF32, output, big_endian, byteflip from the expansion site */     \
+    if (UTF32) {                                                               \
+      if (MASKED) {                                                            \
+        const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1);         \
+        _mm512_mask_storeu_epi32((__m512i *)output, valid_mask, INPUT);        \
+      } else {                                                                 \
+        _mm512_storeu_si512((__m512i *)output, INPUT);                         \
+      }                                                                        \
+      output += VALID_COUNT; /* one UTF-32 unit per character */               \
+    } else {                                                                   \
+      if (MASKED) {                                                            \
+        output += utf32_to_utf16_masked<big_endian>(                           \
+            byteflip, INPUT, VALID_COUNT,                                      \
+            reinterpret_cast<char16_t *>(output));                             \
+      } else {                                                                 \
+        output +=                                                              \
+            utf32_to_utf16<big_endian>(byteflip, INPUT, VALID_COUNT,           \
+                                       reinterpret_cast<char16_t *>(output));  \
+      }                                                                        \
+    }                                                                          \
+  }
 
-#endif
-/* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */
-/* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
-#ifndef SIMDUTF_UTF8_TO_UTF16_H
-#define SIMDUTF_UTF8_TO_UTF16_H
+#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)                       \
+  if (UTF32) { /* widen 64 bytes to 32-bit units, four stores */               \
+    const __m128i t0 = _mm512_castsi512_si128(utf8);                           \
+    const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1);                     \
+    const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2);                     \
+    const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3);                     \
+    _mm512_storeu_si512((__m512i *)(output + 0 * 16),                          \
+                        _mm512_cvtepu8_epi32(t0));                             \
+    _mm512_storeu_si512((__m512i *)(output + 1 * 16),                          \
+                        _mm512_cvtepu8_epi32(t1));                             \
+    _mm512_storeu_si512((__m512i *)(output + 2 * 16),                          \
+                        _mm512_cvtepu8_epi32(t2));                             \
+    _mm512_storeu_si512((__m512i *)(output + 3 * 16),                          \
+                        _mm512_cvtepu8_epi32(t3));                             \
+  } else { /* widen to 16-bit units, two stores; uses big_endian, byteflip */  \
+    const __m256i h0 = _mm512_castsi512_si256(utf8);                           \
+    const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1);                     \
+    if (big_endian) {                                                          \
+      _mm512_storeu_si512(                                                     \
+          (__m512i *)(output + 0 * 16),                                        \
+          _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip));            \
+      _mm512_storeu_si512(                                                     \
+          (__m512i *)(output + 2 * 16),                                        \
+          _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip));            \
+    } else {                                                                   \
+      _mm512_storeu_si512((__m512i *)(output + 0 * 16),                        \
+                          _mm512_cvtepu8_epi16(h0));                           \
+      _mm512_storeu_si512((__m512i *)(output + 2 * 16),                        \
+                          _mm512_cvtepu8_epi16(h1));                           \
+    }                                                                          \
+  }
+/* end file src/icelake/icelake_macros.inl.cpp */
+/* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */
+// file included directly
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf8_to_utf16 {
+// File contains conversion procedure from VALID UTF-8 strings.
 
-template <endianness big_endian>
-inline size_t convert(const char *buf, size_t len, char16_t *utf16_output) {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char16_t *start{utf16_output};
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <=
-        len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 16;
-        while (pos < final_pos) {
-          *utf16_output++ = !match_system(big_endian)
-                                ? char16_t(utf16::swap_bytes(buf[pos]))
-                                : char16_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
+/*
+    valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32.
 
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf16_output++ = !match_system(big_endian)
-                            ? char16_t(utf16::swap_bytes(leading_byte))
-                            : char16_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if (pos + 1 >= len) {
-        return 0;
-      } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return 0;
-      }
-      // range check
-      uint32_t code_point =
-          (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if (code_point < 0x80 || 0x7ff < code_point) {
-        return 0;
-      }
-      if (!match_system(big_endian)) {
-        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if (pos + 2 >= len) {
-        return 0;
-      } // minimal bound checking
+    The `OUTPUT` template type decides what to do with UTF-32: store
+    it directly or convert into UTF-16 (with AVX512).
 
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return 0;
-      }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-        return 0;
-      }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
-                            (data[pos + 1] & 0b00111111) << 6 |
-                            (data[pos + 2] & 0b00111111);
-      if (code_point < 0x800 || 0xffff < code_point ||
-          (0xd7ff < code_point && code_point < 0xe000)) {
-        return 0;
-      }
-      if (!match_system(big_endian)) {
-        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if (pos + 3 >= len) {
-        return 0;
-      } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return 0;
-      }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-        return 0;
-      }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
-        return 0;
-      }
+    Input:
+    - str           - valid UTF-8 string
+    - len           - string length
+    - out_buffer    - output buffer
 
-      // range check
-      uint32_t code_point = (leading_byte & 0b00000111) << 18 |
-                            (data[pos + 1] & 0b00111111) << 12 |
-                            (data[pos + 2] & 0b00111111) << 6 |
-                            (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff || 0x10ffff < code_point) {
-        return 0;
-      }
-      code_point -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-      pos += 4;
-    } else {
-      return 0;
-    }
-  }
-  return utf16_output - start;
-}
+    Result:
+    - pair.first    - the first unprocessed input byte
+    - pair.second   - the first unprocessed output word
+*/
+template <endianness big_endian, typename OUTPUT>
+std::pair<const char *, OUTPUT *>
+valid_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) {
+  constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
+  constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
+  static_assert(
+      UTF32 or UTF16,
+      "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
+  static_assert(!(UTF32 and big_endian),
+                "we do not currently support big-endian UTF-32");
 
-template <endianness big_endian>
-inline result convert_with_errors(const char *buf, size_t len,
-                                  char16_t *utf16_output) {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char16_t *start{utf16_output};
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <=
-        len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 16;
-        while (pos < final_pos) {
-          *utf16_output++ = !match_system(big_endian)
-                                ? char16_t(utf16::swap_bytes(buf[pos]))
-                                : char16_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf16_output++ = !match_system(big_endian)
-                            ? char16_t(utf16::swap_bytes(leading_byte))
-                            : char16_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if (pos + 1 >= len) {
-        return result(error_code::TOO_SHORT, pos);
-      } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      // range check
-      uint32_t code_point =
-          (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if (code_point < 0x80 || 0x7ff < code_point) {
-        return result(error_code::OVERLONG, pos);
-      }
-      if (!match_system(big_endian)) {
-        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8, it should become
-      // a single UTF-16 word.
-      if (pos + 2 >= len) {
-        return result(error_code::TOO_SHORT, pos);
-      } // minimal bound checking
+  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  const char *ptr = str;
+  const char *end = ptr + len;
 
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
-                            (data[pos + 1] & 0b00111111) << 6 |
-                            (data[pos + 2] & 0b00111111);
-      if ((code_point < 0x800) || (0xffff < code_point)) {
-        return result(error_code::OVERLONG, pos);
-      }
-      if (0xd7ff < code_point && code_point < 0xe000) {
-        return result(error_code::SURROGATE, pos);
-      }
-      if (!match_system(big_endian)) {
-        code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point)));
-      }
-      *utf16_output++ = char16_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if (pos + 3 >= len) {
-        return result(error_code::TOO_SHORT, pos);
-      } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
+  OUTPUT *output = dwords;
+  /**
+   * In the main loop, we consume 64 bytes per iteration,
+   * but we access 64 + 4 bytes.
+   * We therefore require end - ptr >= 64 + 4 so that the
+   * extra 4-byte read at ptr + 64 stays inside the input.
+   */
+  while (end - ptr >= 64 + 4) {
+    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
+    const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
+    if (ascii == 0) {
+      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
+      output += 64;
+      ptr += 64;
+      continue;
+    }
 
-      // range check
-      uint32_t code_point = (leading_byte & 0b00000111) << 18 |
-                            (data[pos + 1] & 0b00111111) << 12 |
-                            (data[pos + 2] & 0b00111111) << 6 |
-                            (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff) {
-        return result(error_code::OVERLONG, pos);
-      }
-      if (0x10ffff < code_point) {
-        return result(error_code::TOO_LARGE, pos);
-      }
-      code_point -= 0x10000;
-      uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10));
-      uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF));
-      if (!match_system(big_endian)) {
-        high_surrogate = utf16::swap_bytes(high_surrogate);
-        low_surrogate = utf16::swap_bytes(low_surrogate);
-      }
-      *utf16_output++ = char16_t(high_surrogate);
-      *utf16_output++ = char16_t(low_surrogate);
-      pos += 4;
+    const __m512i lane0 = broadcast_epi128<0>(utf8);
+    const __m512i lane1 = broadcast_epi128<1>(utf8);
+    int valid_count0;
+    __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
+    const __m512i lane2 = broadcast_epi128<2>(utf8);
+    int valid_count1;
+    __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
+    if (valid_count0 + valid_count1 <= 16) {
+      vec0 = _mm512_mask_expand_epi32(
+          vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+      valid_count0 += valid_count1;
+      vec0 = expand_utf8_to_utf32(vec0);
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
     } else {
-      // we either have too many continuation bytes or an invalid leading byte
-      if ((leading_byte & 0b11000000) == 0b10000000) {
-        return result(error_code::TOO_LONG, pos);
-      } else {
-        return result(error_code::HEADER_BITS, pos);
-      }
+      vec0 = expand_utf8_to_utf32(vec0);
+      vec1 = expand_utf8_to_utf32(vec1);
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
     }
-  }
-  return result(error_code::SUCCESS, utf16_output - start);
-}
-
-/**
- * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and
- * we have up to len input bytes left, and we encountered some error. It is
- * possible that the error is at 'buf' exactly, but it could also be in the
- * previous bytes  (up to 3 bytes back).
- *
- * prior_bytes indicates how many bytes, prior to 'buf' may belong to the
- * current memory section and can be safely accessed. We prior_bytes to access
- * safely up to three bytes before 'buf'.
- *
- * The caller is responsible to ensure that len > 0.
- *
- * If the error is believed to have occurred prior to 'buf', the count value
- * contain in the result will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
- */
-template <endianness endian>
-inline result rewind_and_convert_with_errors(size_t prior_bytes,
-                                             const char *buf, size_t len,
-                                             char16_t *utf16_output) {
-  size_t extra_len{0};
-  // We potentially need to go back in time and find a leading byte.
-  // In theory '3' would be sufficient, but sometimes the error can go back
-  // quite far.
-  size_t how_far_back = prior_bytes;
-  // size_t how_far_back = 3; // 3 bytes in the past + current position
-  // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
-  bool found_leading_bytes{false};
-  // important: it is i <= how_far_back and not 'i < how_far_back'.
-  for (size_t i = 0; i <= how_far_back; i++) {
-    unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
-    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
-    if (found_leading_bytes) {
-      if (i > 0 && byte < 128) {
-        // If we had to go back and the leading byte is ascii
-        // then we can stop right away.
-        return result(error_code::TOO_LONG, 0 - i + 1);
-      }
-      buf -= i;
-      extra_len = i;
-      break;
+    const __m512i lane3 = broadcast_epi128<3>(utf8);
+    int valid_count2;
+    __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
+    uint32_t tmp1;
+    ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
+    const __m512i lane4 = _mm512_set1_epi32(tmp1);
+    int valid_count3;
+    __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
+    if (valid_count2 + valid_count3 <= 16) {
+      vec2 = _mm512_mask_expand_epi32(
+          vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
+      valid_count2 += valid_count3;
+      vec2 = expand_utf8_to_utf32(vec2);
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+    } else {
+      vec2 = expand_utf8_to_utf32(vec2);
+      vec3 = expand_utf8_to_utf32(vec3);
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
     }
+    ptr += 4 * 16;
   }
-  //
-  // It is possible for this function to return a negative count in its result.
-  // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described
-  // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
-  // unsigned integral type of the result of the sizeof operator
-  //
-  // An unsigned type will simply wrap round arithmetically (well defined).
-  //
-  if (!found_leading_bytes) {
-    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
-    // [....] [continuation] [continuation] [continuation] | [buf is
-    // continuation] Or we possibly have a stream that does not start with a
-    // leading byte.
-    return result(error_code::TOO_LONG, 0 - how_far_back);
-  }
-  result res = convert_with_errors<endian>(buf, len + extra_len, utf16_output);
-  if (res.error) {
-    res.count -= extra_len;
+
+  if (end - ptr >= 64) {
+    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
+    const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
+    if (ascii == 0) {
+      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
+      output += 64;
+      ptr += 64;
+    } else {
+      const __m512i lane0 = broadcast_epi128<0>(utf8);
+      const __m512i lane1 = broadcast_epi128<1>(utf8);
+      int valid_count0;
+      __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
+      const __m512i lane2 = broadcast_epi128<2>(utf8);
+      int valid_count1;
+      __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
+      if (valid_count0 + valid_count1 <= 16) {
+        vec0 = _mm512_mask_expand_epi32(
+            vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+        valid_count0 += valid_count1;
+        vec0 = expand_utf8_to_utf32(vec0);
+        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+      } else {
+        vec0 = expand_utf8_to_utf32(vec0);
+        vec1 = expand_utf8_to_utf32(vec1);
+        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
+      }
+
+      const __m512i lane3 = broadcast_epi128<3>(utf8);
+      SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
+
+      ptr += 3 * 16;
+    }
   }
-  return res;
+  return {ptr, output};
 }
 
-} // namespace utf8_to_utf16
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+using utf8_to_utf16_result = std::pair<const char *, char16_t *>;
+/* end file src/icelake/icelake_from_valid_utf8.inl.cpp */
+/* begin file src/icelake/icelake_utf8_validation.inl.cpp */
+// file included directly
 
-#endif
-/* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */
+simdutf_really_inline __m512i check_special_cases(__m512i input,
+                                                  const __m512i prev1) {
+  __m512i mask1 = _mm512_setr_epi64(0x0202020202020202, 0x4915012180808080,
+                                    0x0202020202020202, 0x4915012180808080,
+                                    0x0202020202020202, 0x4915012180808080,
+                                    0x0202020202020202, 0x4915012180808080);
+  const __m512i v_0f = _mm512_set1_epi8(0x0f); // keeps the low 4 bits of a byte
+  __m512i index1 = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), v_0f); // high nibble of prev1
 
-/* begin file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
-#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H
-#define SIMDUTF_VALID_UTF8_TO_UTF32_H
+  __m512i byte_1_high = _mm512_shuffle_epi8(mask1, index1); // 16-entry lookup, same table in each 128-bit lane
+  __m512i mask2 = _mm512_setr_epi64(0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
+                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
+                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
+                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb);
+  __m512i index2 = _mm512_and_si512(prev1, v_0f); // low nibble of prev1
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf8_to_utf32 {
+  __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2);
+  __m512i mask3 =
+      _mm512_setr_epi64(0x101010101010101, 0x1010101babaaee6, 0x101010101010101,
+                        0x1010101babaaee6, 0x101010101010101, 0x1010101babaaee6,
+                        0x101010101010101, 0x1010101babaaee6);
+  __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f); // high nibble of input
+  __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3);
+  return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128); // imm 128 (0x80): AND of all three
+}
 
-inline size_t convert_valid(const char *buf, size_t len,
-                            char32_t *utf32_output) {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char32_t *start{utf32_output};
-  while (pos < len) {
-    // try to convert the next block of 8 ASCII bytes
-    if (pos + 8 <=
-        len) { // if it is safe to read 8 more bytes, check that they are ascii
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 8;
-        while (pos < final_pos) {
-          *utf32_output++ = char32_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf32_output++ = char32_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8
-      if (pos + 1 >= len) {
-        break;
-      } // minimal bound checking
-      *utf32_output++ = char32_t(((leading_byte & 0b00011111) << 6) |
-                                 (data[pos + 1] & 0b00111111));
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8
-      if (pos + 2 >= len) {
-        break;
-      } // minimal bound checking
-      *utf32_output++ = char32_t(((leading_byte & 0b00001111) << 12) |
-                                 ((data[pos + 1] & 0b00111111) << 6) |
-                                 (data[pos + 2] & 0b00111111));
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if (pos + 3 >= len) {
-        break;
-      } // minimal bound checking
-      uint32_t code_word = ((leading_byte & 0b00000111) << 18) |
-                           ((data[pos + 1] & 0b00111111) << 12) |
-                           ((data[pos + 2] & 0b00111111) << 6) |
-                           (data[pos + 3] & 0b00111111);
-      *utf32_output++ = char32_t(code_word);
-      pos += 4;
-    } else {
-      // we may have a continuation but we do not do error checking
-      return 0;
-    }
-  }
-  return utf32_output - start;
+simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
+                                                      const __m512i prev_input,
+                                                      const __m512i sc) {
+  __m512i prev2 = prev<2>(input, prev_input);
+  __m512i prev3 = prev<3>(input, prev_input);
+  __m512i is_third_byte = _mm512_subs_epu8(
+      prev2, _mm512_set1_epi8(0b11100000u - 1)); // Only 111_____ will be > 0
+  __m512i is_fourth_byte = _mm512_subs_epu8(
+      prev3, _mm512_set1_epi8(0b11110000u - 1)); // Only 1111____ will be > 0
+  __m512i is_third_or_fourth_byte =
+      _mm512_or_si512(is_third_byte, is_fourth_byte);
+  const __m512i v_7f = _mm512_set1_epi8(char(0x7f));
+  is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte);
+  // We want to compute (is_third_or_fourth_byte AND v80) XOR sc.
+  const __m512i v_80 = _mm512_set1_epi8(char(0x80));
+  return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc,
+                                   0b1101010);
+  // The single ternary-logic operation above is equivalent to:
+  //   __m512i m = _mm512_and_si512(is_third_or_fourth_byte, v_80);
+  //   return _mm512_xor_si512(m, sc);
+}
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the
+// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
+//
+simdutf_really_inline __m512i is_incomplete(const __m512i input) {
+  // Saturating-subtract a per-byte maximum: the result is nonzero only when
+  // one of the last 3 bytes is a lead byte whose sequence cannot end here:
+  // ... 1111____ 111_____ 11______
+  __m512i max_value = _mm512_setr_epi64(0xffffffffffffffff, 0xffffffffffffffff,
+                                        0xffffffffffffffff, 0xffffffffffffffff,
+                                        0xffffffffffffffff, 0xffffffffffffffff,
+                                        0xffffffffffffffff, 0xbfdfefffffffffff);
+  return _mm512_subs_epu8(input, max_value);
 }
 
-} // namespace utf8_to_utf32
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+struct avx512_utf8_checker {
+  // If this is nonzero, there has been a UTF-8 error.
+  __m512i error{};
 
-#endif
-/* end file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */
-/* begin file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
-#ifndef SIMDUTF_UTF8_TO_UTF32_H
-#define SIMDUTF_UTF8_TO_UTF32_H
+  // The last non-ASCII input we received (ASCII blocks do not update it)
+  __m512i prev_input_block{};
+  // Whether the last input we received was incomplete (used for ASCII fast
+  // path)
+  __m512i prev_incomplete{};
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf8_to_utf32 {
+  //
+  // Check whether the current bytes are valid UTF-8.
+  //
+  simdutf_really_inline void check_utf8_bytes(const __m512i input,
+                                              const __m512i prev_input) {
+    // Run the special-case lookup on input and prev1, then combine it with the
+    // multibyte length check (which looks 2 and 3 bytes back) and accumulate
+    // the outcome into this->error.
+    __m512i prev1 = prev<1>(input, prev_input);
+    __m512i sc = check_special_cases(input, prev1);
+    this->error = _mm512_or_si512(
+        check_multibyte_lengths(input, prev_input, sc), this->error);
+  }
 
-inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char32_t *start{utf32_output};
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <=
-        len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 16;
-        while (pos < final_pos) {
-          *utf32_output++ = char32_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
+  // The only problem that can happen at EOF is that a multibyte character is
+  // too short or a byte value too large in the last bytes: check_special_cases
+  // only checks for bytes too large in the first of two bytes.
+  simdutf_really_inline void check_eof() {
+    // If the last non-ASCII block ended with an incomplete UTF-8 character,
+    // no further input can complete it, so record it as an error.
+    this->error = _mm512_or_si512(this->error, this->prev_incomplete);
+  }
+
+  // returns true if ASCII.
+  simdutf_really_inline bool check_next_input(const __m512i input) {
+    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
+    const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
+    if (ascii == 0) {
+      this->error = _mm512_or_si512(this->error, this->prev_incomplete);
+      return true;
+    } else {
+      this->check_utf8_bytes(input, this->prev_input_block);
+      this->prev_incomplete = is_incomplete(input);
+      this->prev_input_block = input;
+      return false;
     }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf32_output++ = char32_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8
-      if (pos + 1 >= len) {
-        return 0;
-      } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return 0;
-      }
-      // range check
-      uint32_t code_point =
-          (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if (code_point < 0x80 || 0x7ff < code_point) {
-        return 0;
-      }
-      *utf32_output++ = char32_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8
-      if (pos + 2 >= len) {
-        return 0;
-      } // minimal bound checking
+  }
+  // do not forget to call check_eof!
+  simdutf_really_inline bool errors() const {
+    return _mm512_test_epi8_mask(this->error, this->error) != 0;
+  }
+}; // struct avx512_utf8_checker
+/* end file src/icelake/icelake_utf8_validation.inl.cpp */
+/* begin file src/icelake/icelake_from_utf8.inl.cpp */
+// file included directly
 
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return 0;
-      }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-        return 0;
-      }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
-                            (data[pos + 1] & 0b00111111) << 6 |
-                            (data[pos + 2] & 0b00111111);
-      if (code_point < 0x800 || 0xffff < code_point ||
-          (0xd7ff < code_point && code_point < 0xe000)) {
-        return 0;
-      }
-      *utf32_output++ = char32_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if (pos + 3 >= len) {
-        return 0;
-      } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return 0;
-      }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-        return 0;
-      }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
-        return 0;
-      }
+// File contains conversion procedure from possibly invalid UTF-8 strings.
 
-      // range check
-      uint32_t code_point = (leading_byte & 0b00000111) << 18 |
-                            (data[pos + 1] & 0b00111111) << 12 |
-                            (data[pos + 2] & 0b00111111) << 6 |
-                            (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff || 0x10ffff < code_point) {
-        return 0;
-      }
-      *utf32_output++ = char32_t(code_point);
-      pos += 4;
+/**
+ * Attempts to convert up to len 1-byte code units from in (in UTF-8 format) to
+ * out.
+ * Returns the position of the input and output after the processing is
+ * completed. Upon error, the output is set to null.
+ */
+
+template <endianness big_endian>
+utf8_to_utf16_result
+fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) {
+  const char *const final_in = in + len;
+  bool result = true; // becomes false once a block fails to convert
+  while (result) {
+    if (final_in - in >= 64) { // at least one full 64-byte block remains
+      result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(
+          in, out, final_in - in); // presumably advances in/out - TODO confirm
+    } else if (in < final_in) { // fewer than 64 bytes remain
+      result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(
+          in, out, final_in - in);
     } else {
-      return 0;
+      break; // all input consumed
     }
   }
-  return utf32_output - start;
+  if (!result) {
+    out = nullptr; // a null output pointer reports the failure to the caller
+  }
+  return std::make_pair(in, out);
 }
 
-inline result convert_with_errors(const char *buf, size_t len,
-                                  char32_t *utf32_output) {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char32_t *start{utf32_output};
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <=
-        len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2};
-      if ((v & 0x8080808080808080) == 0) {
-        size_t final_pos = pos + 16;
-        while (pos < final_pos) {
-          *utf32_output++ = char32_t(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
+template <endianness big_endian>
+simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in,
+                                                              size_t len,
+                                                              char16_t *out) {
+  const char *const init_in = in;
+  const char16_t *const init_out = out;
+  const char *const final_in = in + len;
+  bool result = true; // becomes false once a block fails to convert
+  while (result) {
+    if (final_in - in >= 64) { // at least one full 64-byte block remains
+      result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(
+          in, out, final_in - in);
+    } else if (in < final_in) { // fewer than 64 bytes remain
+      result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(
+          in, out, final_in - in);
+    } else {
+      break; // all input consumed
     }
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *utf32_output++ = char32_t(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) == 0b11000000) {
-      // We have a two-byte UTF-8
-      if (pos + 1 >= len) {
-        return result(error_code::TOO_SHORT, pos);
-      } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      // range check
-      uint32_t code_point =
-          (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
-      if (code_point < 0x80 || 0x7ff < code_point) {
-        return result(error_code::OVERLONG, pos);
+  }
+  if (!result) {
+    size_t pos = size_t(in - init_in); // byte offset where conversion stopped
+    if (pos < len && (init_in[pos] & 0xc0) == 0x80 && pos >= 64) {
+      // We must check whether we are the fourth continuation byte
+      bool c1 = (init_in[pos - 1] & 0xc0) == 0x80;
+      bool c2 = (init_in[pos - 2] & 0xc0) == 0x80;
+      bool c3 = (init_in[pos - 3] & 0xc0) == 0x80;
+      if (c1 && c2 && c3) {
+        return {simdutf::TOO_LONG, pos};
       }
-      *utf32_output++ = char32_t(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8
-      if (pos + 2 >= len) {
-        return result(error_code::TOO_SHORT, pos);
-      } // minimal bound checking
+    }
+    // rewind_and_convert_with_errors will seek a potential error from in
+    // onward, with the ability to go back up to in - init_in bytes, and read
+    // final_in - in bytes forward.
+    simdutf::result res =
+        scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(
+            in - init_in, in, final_in - in, out);
+    res.count += (in - init_in); // rebase count onto the start of the input
+    return res;
+  } else {
+    return simdutf::result(error_code::SUCCESS, out - init_out);
+  }
+}
 
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      // range check
-      uint32_t code_point = (leading_byte & 0b00001111) << 12 |
-                            (data[pos + 1] & 0b00111111) << 6 |
-                            (data[pos + 2] & 0b00111111);
-      if (code_point < 0x800 || 0xffff < code_point) {
-        return result(error_code::OVERLONG, pos);
-      }
-      if (0xd7ff < code_point && code_point < 0xe000) {
-        return result(error_code::SURROGATE, pos);
-      }
-      *utf32_output++ = char32_t(code_point);
-      pos += 3;
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      if (pos + 3 >= len) {
-        return result(error_code::TOO_SHORT, pos);
-      } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
-      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      }
+template <endianness big_endian, typename OUTPUT>
+// Validates UTF-8 and transcodes it to UTF-16 or UTF-32 (selected by OUTPUT).
+// todo: replace with the utf-8 to utf-16 routine adapted to utf-32; legacy code.
+std::pair<const char *, OUTPUT *> // {first input byte not transcoded, output end}
+validating_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) {
+  constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
+  constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
+  static_assert(
+      UTF32 or UTF16,
+      "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
+  static_assert(!(UTF32 and big_endian),
+                "we do not currently support big-endian UTF-32");
 
-      // range check
-      uint32_t code_point = (leading_byte & 0b00000111) << 18 |
-                            (data[pos + 1] & 0b00111111) << 12 |
-                            (data[pos + 2] & 0b00111111) << 6 |
-                            (data[pos + 3] & 0b00111111);
-      if (code_point <= 0xffff) {
-        return result(error_code::OVERLONG, pos);
-      }
-      if (0x10ffff < code_point) {
-        return result(error_code::TOO_LARGE, pos);
-      }
-      *utf32_output++ = char32_t(code_point);
-      pos += 4;
+  const char *ptr = str;
+  const char *end = ptr + len;
+  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  OUTPUT *output = dwords;
+  avx512_utf8_checker checker{};
+  /**
+   * In the main loop, we consume 64 bytes per iteration,
+   * but we access 64 + 4 bytes.
+   * We use masked writes to avoid overruns, see
+   * https://github.com/simdutf/simdutf/issues/471
+   */
+  while (end - ptr >= 64 + 4) {
+    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+    if (checker.check_next_input(utf8)) { // ASCII-only block: fast path
+      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
+      output += 64;
+      ptr += 64;
+      continue;
+    }
+    const __m512i lane0 = broadcast_epi128<0>(utf8);
+    const __m512i lane1 = broadcast_epi128<1>(utf8);
+    int valid_count0;
+    __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
+    const __m512i lane2 = broadcast_epi128<2>(utf8);
+    int valid_count1;
+    __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
+    if (valid_count0 + valid_count1 <= 16) { // both fit in one vector
+      vec0 = _mm512_mask_expand_epi32(
+          vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+      valid_count0 += valid_count1;
+      vec0 = expand_utf8_to_utf32(vec0);
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+    } else {
+      vec0 = expand_utf8_to_utf32(vec0);
+      vec1 = expand_utf8_to_utf32(vec1);
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
+    }
+    const __m512i lane3 = broadcast_epi128<3>(utf8);
+    int valid_count2;
+    __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
+    uint32_t tmp1;
+    ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); // the 4 bytes past this block
+    const __m512i lane4 = _mm512_set1_epi32(tmp1);
+    int valid_count3;
+    __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
+    if (valid_count2 + valid_count3 <= 16) { // both fit in one vector
+      vec2 = _mm512_mask_expand_epi32(
+          vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
+      valid_count2 += valid_count3;
+      vec2 = expand_utf8_to_utf32(vec2);
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
     } else {
-      // we either have too many continuation bytes or an invalid leading byte
-      if ((leading_byte & 0b11000000) == 0b10000000) {
-        return result(error_code::TOO_LONG, pos);
-      } else {
-        return result(error_code::HEADER_BITS, pos);
-      }
+      vec2 = expand_utf8_to_utf32(vec2);
+      vec3 = expand_utf8_to_utf32(vec3);
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
     }
+    ptr += 4 * 16; // all four 16-byte lanes consumed
   }
-  return result(error_code::SUCCESS, utf32_output - start);
-}
+  const char *validatedptr = ptr; // validated up to ptr
 
-/**
- * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and
- * we have up to len input bytes left, and we encountered some error. It is
- * possible that the error is at 'buf' exactly, but it could also be in the
- * previous bytes location (up to 3 bytes back).
- *
- * prior_bytes indicates how many bytes, prior to 'buf' may belong to the
- * current memory section and can be safely accessed. We prior_bytes to access
- * safely up to three bytes before 'buf'.
- *
- * The caller is responsible to ensure that len > 0.
- *
- * If the error is believed to have occurred prior to 'buf', the count value
- * contain in the result will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
- */
-inline result rewind_and_convert_with_errors(size_t prior_bytes,
-                                             const char *buf, size_t len,
-                                             char32_t *utf32_output) {
-  size_t extra_len{0};
-  // We potentially need to go back in time and find a leading byte.
-  size_t how_far_back = 3; // 3 bytes in the past + current position
-  if (how_far_back > prior_bytes) {
-    how_far_back = prior_bytes;
-  }
-  bool found_leading_bytes{false};
-  // important: it is i <= how_far_back and not 'i < how_far_back'.
-  for (size_t i = 0; i <= how_far_back; i++) {
-    unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
-    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
-    if (found_leading_bytes) {
-      if (i > 0 && byte < 128) {
-        // If we had to go back and the leading byte is ascii
-        // then we can stop right away.
-        return result(error_code::TOO_LONG, 0 - i + 1);
+  // For the final pass, we validate 64 bytes, but we only transcode
+  // 3*16 bytes, so we may end up double-validating 16 bytes.
+  if (end - ptr >= 64) {
+    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+    if (checker.check_next_input(utf8)) { // ASCII-only block: fast path
+      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
+      output += 64;
+      ptr += 64;
+    } else {
+      const __m512i lane0 = broadcast_epi128<0>(utf8);
+      const __m512i lane1 = broadcast_epi128<1>(utf8);
+      int valid_count0;
+      __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
+      const __m512i lane2 = broadcast_epi128<2>(utf8);
+      int valid_count1;
+      __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
+      if (valid_count0 + valid_count1 <= 16) { // both fit in one vector
+        vec0 = _mm512_mask_expand_epi32(
+            vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+        valid_count0 += valid_count1;
+        vec0 = expand_utf8_to_utf32(vec0);
+        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+      } else {
+        vec0 = expand_utf8_to_utf32(vec0);
+        vec1 = expand_utf8_to_utf32(vec1);
+        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
       }
-      buf -= i;
-      extra_len = i;
-      break;
+
+      const __m512i lane3 = broadcast_epi128<3>(utf8);
+      SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
+
+      ptr += 3 * 16; // the last 16-byte lane stays untranscoded
     }
+    validatedptr += 4 * 16; // but all 64 bytes went through the checker
   }
-  //
-  // It is possible for this function to return a negative count in its result.
-  // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described
-  // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
-  // unsigned integral type of the result of the sizeof operator
-  //
-  // An unsigned type will simply wrap round arithmetically (well defined).
-  //
-  if (!found_leading_bytes) {
-    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
-    // [....] [continuation] [continuation] [continuation] | [buf is
-    // continuation] Or we possibly have a stream that does not start with a
-    // leading byte.
-    return result(error_code::TOO_LONG, 0 - how_far_back);
+  if (end != validatedptr) { // validate the remaining tail with a masked load
+    const __m512i utf8 =
+        _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)),
+                                (const __m512i *)validatedptr);
+    checker.check_next_input(utf8);
   }
-
-  result res = convert_with_errors(buf, len + extra_len, utf32_output);
-  if (res.error) {
-    res.count -= extra_len;
+  checker.check_eof(); // presumably flags a sequence truncated at the end (confirm)
+  if (checker.errors()) {
+    return {ptr, nullptr}; // We found an error.
   }
-  return res;
+  return {ptr, output}; // ptr: first input byte not transcoded
 }
 
-} // namespace utf8_to_utf32
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
-
-#endif
-/* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */
+// Like validating_utf8_to_fixed_length but returns as soon as an error is
+// identified. todo: replace with the utf-8 to utf-16 routine adapted to
+// utf-32. This code is legacy.
+template <endianness big_endian, typename OUTPUT>
+std::tuple<const char *, OUTPUT *, bool> // {input pos, output pos, false on error}
+validating_utf8_to_fixed_length_with_constant_checks(const char *str,
+                                                     size_t len,
+                                                     OUTPUT *dwords) {
+  constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
+  constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
+  static_assert(
+      UTF32 or UTF16,
+      "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
+  static_assert(!(UTF32 and big_endian),
+                "we do not currently support big-endian UTF-32");
 
-/* begin file src/scalar/latin1_to_utf16/latin1_to_utf16.h */
-#ifndef SIMDUTF_LATIN1_TO_UTF16_H
-#define SIMDUTF_LATIN1_TO_UTF16_H
+  const char *ptr = str;
+  const char *end = ptr + len;
+  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  OUTPUT *output = dwords;
+  avx512_utf8_checker checker{};
+  /**
+   * In the main loop, we consume 64 bytes per iteration,
+   * but we access 64 + 4 bytes.
+   */
+  while (end - ptr >= 4 + 64) {
+    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+    bool ascii = checker.check_next_input(utf8);
+    if (checker.errors()) { // checked on every block, before anything is written
+      return {ptr, output, false}; // We found an error.
+    }
+    if (ascii) {
+      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
+      output += 64;
+      ptr += 64;
+      continue;
+    }
+    const __m512i lane0 = broadcast_epi128<0>(utf8);
+    const __m512i lane1 = broadcast_epi128<1>(utf8);
+    int valid_count0;
+    __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
+    const __m512i lane2 = broadcast_epi128<2>(utf8);
+    int valid_count1;
+    __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
+    if (valid_count0 + valid_count1 <= 16) { // both fit in one vector
+      vec0 = _mm512_mask_expand_epi32(
+          vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+      valid_count0 += valid_count1;
+      vec0 = expand_utf8_to_utf32(vec0);
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+    } else {
+      vec0 = expand_utf8_to_utf32(vec0);
+      vec1 = expand_utf8_to_utf32(vec1);
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
+    }
+    const __m512i lane3 = broadcast_epi128<3>(utf8);
+    int valid_count2;
+    __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
+    uint32_t tmp1;
+    ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); // the 4 bytes past this block
+    const __m512i lane4 = _mm512_set1_epi32(tmp1);
+    int valid_count3;
+    __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
+    if (valid_count2 + valid_count3 <= 16) { // both fit in one vector
+      vec2 = _mm512_mask_expand_epi32(
+          vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
+      valid_count2 += valid_count3;
+      vec2 = expand_utf8_to_utf32(vec2);
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+    } else {
+      vec2 = expand_utf8_to_utf32(vec2);
+      vec3 = expand_utf8_to_utf32(vec3);
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
+    }
+    ptr += 4 * 16; // all four 16-byte lanes consumed
+  }
+  const char *validatedptr = ptr; // validated up to ptr
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace latin1_to_utf16 {
+  // For the final pass, we validate 64 bytes, but we only transcode
+  // 3*16 bytes, so we may end up double-validating 16 bytes.
+  if (end - ptr >= 64) {
+    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+    bool ascii = checker.check_next_input(utf8);
+    if (checker.errors()) {
+      return {ptr, output, false}; // We found an error.
+    }
+    if (ascii) {
+      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
+      output += 64;
+      ptr += 64;
+    } else {
+      const __m512i lane0 = broadcast_epi128<0>(utf8);
+      const __m512i lane1 = broadcast_epi128<1>(utf8);
+      int valid_count0;
+      __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
+      const __m512i lane2 = broadcast_epi128<2>(utf8);
+      int valid_count1;
+      __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
+      if (valid_count0 + valid_count1 <= 16) { // both fit in one vector
+        vec0 = _mm512_mask_expand_epi32(
+            vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
+        valid_count0 += valid_count1;
+        vec0 = expand_utf8_to_utf32(vec0);
+        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+      } else {
+        vec0 = expand_utf8_to_utf32(vec0);
+        vec1 = expand_utf8_to_utf32(vec1);
+        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
+      }
 
-template <endianness big_endian>
-inline size_t convert(const char *buf, size_t len, char16_t *utf16_output) {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char16_t *start{utf16_output};
+      const __m512i lane3 = broadcast_epi128<3>(utf8);
+      SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
 
-  while (pos < len) {
-    uint16_t word =
-        uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
-    *utf16_output++ =
-        char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word));
-    pos++;
+      ptr += 3 * 16; // the last 16-byte lane stays untranscoded
+    }
+    validatedptr += 4 * 16; // but all 64 bytes went through the checker
   }
-
-  return utf16_output - start;
+  if (end != validatedptr) { // validate the remaining tail with a masked load
+    const __m512i utf8 =
+        _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)),
+                                (const __m512i *)validatedptr);
+    checker.check_next_input(utf8);
+  }
+  checker.check_eof(); // presumably flags a sequence truncated at the end (confirm)
+  if (checker.errors()) {
+    return {ptr, output, false}; // We found an error.
+  }
+  return {ptr, output, true}; // ptr: first input byte not transcoded
 }
+/* end file src/icelake/icelake_from_utf8.inl.cpp */
+/* begin file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */
+// file included directly
 
-template <endianness big_endian>
-inline result convert_with_errors(const char *buf, size_t len,
-                                  char16_t *utf16_output) {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char16_t *start{utf16_output};
+// Conversion procedure from possibly invalid UTF-8 strings to Latin-1.
 
-  while (pos < len) {
-    uint16_t word =
-        uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point
-    *utf16_output++ =
-        char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word));
-    pos++;
+template <bool is_remaining> // true: partial tail block, masked load and store
+simdutf_really_inline size_t process_block_from_utf8_to_latin1(
+    const char *buf, size_t len, char *latin_output, __m512i minus64,
+    __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) {
+  __mmask64 load_mask =
+      is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; // low len bits
+  __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf);
+  __mmask64 nonascii = _mm512_movepi8_mask(input); // bytes with the top bit set
+  if (nonascii == 0) {
+    if (*next_leading_ptr) { // If we ended with a leading byte, it is an error.
+      return 0;              // Indicates error
+    }
+    is_remaining
+        ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
+        : _mm512_storeu_si512((__m512i *)latin_output, input);
+    return len;
   }
 
-  return result(error_code::SUCCESS, utf16_output - start);
-}
-
-} // namespace latin1_to_utf16
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
-
-#endif
-/* end file src/scalar/latin1_to_utf16/latin1_to_utf16.h */
-/* begin file src/scalar/latin1_to_utf32/latin1_to_utf32.h */
-#ifndef SIMDUTF_LATIN1_TO_UTF32_H
-#define SIMDUTF_LATIN1_TO_UTF32_H
+  const __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); // unsigned >= minus64 (0xC0 from the callers)
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace latin1_to_utf32 {
+  __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); // xor 0xC2: 0xC2 -> 0, 0xC3 -> 1
+  __mmask64 invalid_leading_bytes =
+      _mm512_mask_cmpgt_epu8_mask(leading, highbits, one); // with one == 1: leader other than 0xC2/0xC3
 
-inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) {
-  const unsigned char *data = reinterpret_cast<const unsigned char *>(buf);
-  char32_t *start{utf32_output};
-  for (size_t i = 0; i < len; i++) {
-    *utf32_output++ = (char32_t)data[i];
+  if (invalid_leading_bytes) {
+    return 0; // Indicates error
   }
-  return utf32_output - start;
-}
 
-} // namespace latin1_to_utf32
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+  __mmask64 leading_shift = (leading << 1) | *next_leading_ptr; // positions right after a leading byte
 
-#endif
-/* end file src/scalar/latin1_to_utf32/latin1_to_utf32.h */
+  if ((nonascii ^ leading) != leading_shift) { // other non-ASCII bytes must sit exactly there
+    return 0; // Indicates error
+  }
 
-/* begin file src/scalar/utf8_to_latin1/utf8_to_latin1.h */
-#ifndef SIMDUTF_UTF8_TO_LATIN1_H
-#define SIMDUTF_UTF8_TO_LATIN1_H
+  const __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); // with one == 1: bytes equal to 0xC3
+  input =
+      _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); // byte after 0xC3: subtract minus64 (adds 64 when minus64 is -64)
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf8_to_latin1 {
+  __mmask64 retain = ~leading & load_mask; // drop leading bytes and the masked-off tail
+  __m512i output = _mm512_maskz_compress_epi8(retain, input);
+  int64_t written_out = count_ones(retain);
+  if (written_out == 0) {
+    return 0; // Indicates error
+  }
+  *next_bit6_ptr = bit6 >> 63; // carry bit 63 into the next block
+  *next_leading_ptr = leading >> 63; // carry bit 63 into the next block
 
-inline size_t convert(const char *buf, size_t len, char *latin_output) {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char *start{latin_output};
+  __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out); // low written_out bits
 
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <=
-        len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000
-                           // 1000 1000 .... etc
-      if ((v & 0x8080808080808080) ==
-          0) { // if NONE of these are set, e.g. all of them are zero, then
-               // everything is ASCII
-        size_t final_pos = pos + 16;
-        while (pos < final_pos) {
-          *latin_output++ = char(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
+  _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);
 
-    // suppose it is not an all ASCII byte sequence
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *latin_output++ = char(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) ==
-               0b11000000) { // the first three bits indicate:
-      // We have a two-byte UTF-8
-      if (pos + 1 >= len) {
-        return 0;
-      } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return 0;
-      } // checks if the next byte is a valid continuation byte in UTF-8. A
-        // valid continuation byte starts with 10.
-      // range check -
-      uint32_t code_point =
-          (leading_byte & 0b00011111) << 6 |
-          (data[pos + 1] &
-           0b00111111); // assembles the Unicode code point from the two bytes.
-                        // It does this by discarding the leading 110 and 10
-                        // bits from the two bytes, shifting the remaining bits
-                        // of the first byte, and then combining the results
-                        // with a bitwise OR operation.
-      if (code_point < 0x80 || 0xFF < code_point) {
-        return 0; // We only care about the range 129-255 which is Non-ASCII
-                  // latin1 characters. A code_point beneath 0x80 is invalid as
-                  // it is already covered by bytes whose leading bit is zero.
-      }
-      *latin_output++ = char(code_point);
-      pos += 2;
-    } else {
-      return 0;
-    }
-  }
-  return latin_output - start;
+  return written_out; // number of Latin-1 bytes stored
 }
 
-inline result convert_with_errors(const char *buf, size_t len,
-                                  char *latin_output) {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
-  size_t pos = 0;
-  char *start{latin_output};
-
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <=
-        len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000
-                           // 1000 1000...etc
-      if ((v & 0x8080808080808080) ==
-          0) { // if NONE of these are set, e.g. all of them are zero, then
-               // everything is ASCII
-        size_t final_pos = pos + 16;
-        while (pos < final_pos) {
-          *latin_output++ = char(buf[pos]);
-          pos++;
-        }
-        continue;
-      }
-    }
-    // suppose it is not an all ASCII byte sequence
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *latin_output++ = char(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) ==
-               0b11000000) { // the first three bits indicate:
-      // We have a two-byte UTF-8
-      if (pos + 1 >= len) {
-        return result(error_code::TOO_SHORT, pos);
-      } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return result(error_code::TOO_SHORT, pos);
-      } // checks if the next byte is a valid continuation byte in UTF-8. A
-        // valid continuation byte starts with 10.
-      // range check -
-      uint32_t code_point =
-          (leading_byte & 0b00011111) << 6 |
-          (data[pos + 1] &
-           0b00111111); // assembles the Unicode code point from the two bytes.
-                        // It does this by discarding the leading 110 and 10
-                        // bits from the two bytes, shifting the remaining bits
-                        // of the first byte, and then combining the results
-                        // with a bitwise OR operation.
-      if (code_point < 0x80) {
-        return result(error_code::OVERLONG, pos);
-      }
-      if (0xFF < code_point) {
-        return result(error_code::TOO_LARGE, pos);
-      } // We only care about the range 129-255 which is Non-ASCII latin1
-        // characters
-      *latin_output++ = char(code_point);
-      pos += 2;
-    } else if ((leading_byte & 0b11110000) == 0b11100000) {
-      // We have a three-byte UTF-8
-      return result(error_code::TOO_LARGE, pos);
-    } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000
-      // we have a 4-byte UTF-8 word.
-      return result(error_code::TOO_LARGE, pos);
-    } else {
-      // we either have too many continuation bytes or an invalid leading byte
-      if ((leading_byte & 0b11000000) == 0b10000000) {
-        return result(error_code::TOO_LONG, pos);
-      }
+size_t utf8_to_latin1_avx512(const char *&inbuf, size_t len,
+                             char *&inlatin_output) { // both references are updated on every return path
+  const char *buf = inbuf;
+  char *latin_output = inlatin_output;
+  char *start = latin_output;
+  size_t pos = 0;
+  __m512i minus64 = _mm512_set1_epi8(-64); // every byte is 0xC0 (1100 0000)
+  __m512i one = _mm512_set1_epi8(1);
+  __mmask64 next_leading = 0; // carry: previous block ended on a leading byte
+  __mmask64 next_bit6 = 0; // carry: that trailing leading byte was 0xC3
 
-      return result(error_code::HEADER_BITS, pos);
+  while (pos + 64 <= len) { // full 64-byte blocks
+    size_t written = process_block_from_utf8_to_latin1<false>(
+        buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6);
+    if (written == 0) {
+      inlatin_output = latin_output;
+      inbuf = buf + pos - next_leading; // step back onto a pending leading byte
+      return 0; // Indicates error at pos or after, or just before pos (too
+                // short error)
     }
+    latin_output += written;
+    pos += 64;
   }
-  return result(error_code::SUCCESS, latin_output - start);
-}
 
-inline result rewind_and_convert_with_errors(size_t prior_bytes,
-                                             const char *buf, size_t len,
-                                             char *latin1_output) {
-  size_t extra_len{0};
-  // We potentially need to go back in time and find a leading byte.
-  // In theory '3' would be sufficient, but sometimes the error can go back
-  // quite far.
-  size_t how_far_back = prior_bytes;
-  // size_t how_far_back = 3; // 3 bytes in the past + current position
-  // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; }
-  bool found_leading_bytes{false};
-  // important: it is i <= how_far_back and not 'i < how_far_back'.
-  for (size_t i = 0; i <= how_far_back; i++) {
-    unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
-    found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
-    if (found_leading_bytes) {
-      if (i > 0 && byte < 128) {
-        // If we had to go back and the leading byte is ascii
-        // then we can stop right away.
-        return result(error_code::TOO_LONG, 0 - i + 1);
-      }
-      buf -= i;
-      extra_len = i;
-      break;
+  if (pos < len) { // partial tail block, handled with masked load and store
+    size_t remaining = len - pos;
+    size_t written = process_block_from_utf8_to_latin1<true>(
+        buf + pos, remaining, latin_output, minus64, one, &next_leading,
+        &next_bit6);
+    if (written == 0) {
+      inbuf = buf + pos - next_leading; // step back onto a pending leading byte
+      inlatin_output = latin_output;
+      return 0; // Indicates error at pos or after, or just before pos (too
+                // short error)
     }
+    latin_output += written;
   }
-  //
-  // It is possible for this function to return a negative count in its result.
-  // C++ Standard Section 18.1 defines size_t is in <cstddef> which is described
-  // in C Standard as <stddef.h>. C Standard Section 4.1.5 defines size_t as an
-  // unsigned integral type of the result of the sizeof operator
-  //
-  // An unsigned type will simply wrap round arithmetically (well defined).
-  //
-  if (!found_leading_bytes) {
-    // If how_far_back == 3, we may have four consecutive continuation bytes!!!
-    // [....] [continuation] [continuation] [continuation] | [buf is
-    // continuation] Or we possibly have a stream that does not start with a
-    // leading byte.
-    return result(error_code::TOO_LONG, 0 - how_far_back);
-  }
-  result res = convert_with_errors(buf, len + extra_len, latin1_output);
-  if (res.error) {
-    res.count -= extra_len;
+  if (next_leading) { // the input ends right after a leading byte
+    inbuf = buf + len - next_leading;
+    inlatin_output = latin_output;
+    return 0; // Indicates error at end of buffer
   }
-  return res;
+  inlatin_output = latin_output;
+  inbuf += len; // success: the whole input was consumed
+  return size_t(latin_output - start); // number of Latin-1 bytes written
 }
+/* end file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */
+/* begin file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */
+// file included directly
 
-} // namespace utf8_to_latin1
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+// Conversion procedure from valid UTF-8 strings to Latin-1.
 
-#endif
-/* end file src/scalar/utf8_to_latin1/utf8_to_latin1.h */
-/* begin file src/scalar/utf16_to_latin1/utf16_to_latin1.h */
-#ifndef SIMDUTF_UTF16_TO_LATIN1_H
-#define SIMDUTF_UTF16_TO_LATIN1_H
+template <bool is_remaining> // true: partial tail block, masked load and store
+simdutf_really_inline size_t process_valid_block_from_utf8_to_latin1(
+    const char *buf, size_t len, char *latin_output, __m512i minus64,
+    __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) {
+  __mmask64 load_mask =
+      is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; // low len bits
+  __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf);
+  __mmask64 nonascii = _mm512_movepi8_mask(input); // bytes with the top bit set
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf16_to_latin1 {
+  if (nonascii == 0) { // pure ASCII: copy through unchanged
+    is_remaining
+        ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
+        : _mm512_storeu_si512((__m512i *)latin_output, input);
+    return len;
+  }
 
-#include <cstring> // for std::memcpy
+  __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); // unsigned >= minus64 (0xC0 from the caller)
 
-template <endianness big_endian>
-inline size_t convert(const char16_t *buf, size_t len, char *latin_output) {
-  if (len == 0) {
-    return 0;
+  __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); // xor 0xC2: 0xC3 -> 1
+
+  *next_leading_ptr = leading >> 63; // carry bit 63 into the next block
+
+  __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); // with one == 1: bytes equal to 0xC3
+  input =
+      _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); // byte after 0xC3: subtract minus64 (adds 64 when minus64 is -64)
+  *next_bit6_ptr = bit6 >> 63; // carry bit 63 into the next block
+
+  __mmask64 retain = ~leading & load_mask; // drop leading bytes and the masked-off tail
+  __m512i output = _mm512_maskz_compress_epi8(retain, input);
+  int64_t written_out = count_ones(retain);
+  if (written_out == 0) {
+    return 0; // Indicates error
   }
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
+  __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out); // low written_out bits
+  // Optimization opportunity: sometimes, masked writes are not needed.
+  _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);
+  return written_out; // number of Latin-1 bytes stored
+}
+
+size_t valid_utf8_to_latin1_avx512(const char *buf, size_t len,
+                                   char *latin_output) { // performs no validation of its own
+  char *start = latin_output;
   size_t pos = 0;
-  char *current_write = latin_output;
-  uint16_t word = 0;
-  uint16_t too_large = 0;
+  __m512i minus64 = _mm512_set1_epi8(-64); // every byte is 0xC0 (1100 0000)
+  __m512i one = _mm512_set1_epi8(1);
+  __mmask64 next_leading = 0; // carry: previous block ended on a leading byte
+  __mmask64 next_bit6 = 0; // carry: that trailing leading byte was 0xC3
 
-  while (pos < len) {
-    word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    too_large |= word;
-    *current_write++ = char(word & 0xFF);
-    pos++;
+  while (pos + 64 <= len) { // full 64-byte blocks
+    size_t written = process_valid_block_from_utf8_to_latin1<false>(
+        buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6);
+    latin_output += written;
+    pos += 64;
   }
-  if ((too_large & 0xFF00) != 0) {
-    return 0;
+
+  if (pos < len) { // partial tail block, handled with masked load and store
+    size_t remaining = len - pos;
+    size_t written = process_valid_block_from_utf8_to_latin1<true>(
+        buf + pos, remaining, latin_output, minus64, one, &next_leading,
+        &next_bit6);
+    latin_output += written;
   }
 
-  return current_write - latin_output;
+  return (size_t)(latin_output - start); // number of Latin-1 bytes written
 }
-
+/* end file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */
+/* begin file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */
+// file included directly
 template <endianness big_endian>
-inline result convert_with_errors(const char16_t *buf, size_t len,
-                                  char *latin_output) {
-  if (len == 0) {
-    return result(error_code::SUCCESS, 0);
+size_t icelake_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+                                       char *latin1_output) {
+  const char16_t *end = buf + len;
+  __m512i v_0xFF = _mm512_set1_epi16(0xff);
+  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  __m512i shufmask = _mm512_set_epi8(
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38,
+      36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+  while (end - buf >= 32) {
+    __m512i in = _mm512_loadu_si512((__m512i *)buf);
+    if (big_endian) {
+      in = _mm512_shuffle_epi8(in, byteflip);
+    }
+    if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
+      return 0;
+    }
+    _mm256_storeu_si256(
+        (__m256i *)latin1_output,
+        _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
+    latin1_output += 32;
+    buf += 32;
   }
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char *start{latin_output};
-  uint16_t word;
-
-  while (pos < len) {
-    if (pos + 16 <= len) { // if it is safe to read 32 more bytes, check that
-                           // they are Latin1
-      uint64_t v1, v2, v3, v4;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      ::memcpy(&v2, data + pos + 4, sizeof(uint64_t));
-      ::memcpy(&v3, data + pos + 8, sizeof(uint64_t));
-      ::memcpy(&v4, data + pos + 12, sizeof(uint64_t));
-
-      if (!match_system(big_endian)) {
-        v1 = (v1 >> 8) | (v1 << (64 - 8));
-      }
-      if (!match_system(big_endian)) {
-        v2 = (v2 >> 8) | (v2 << (64 - 8));
-      }
-      if (!match_system(big_endian)) {
-        v3 = (v3 >> 8) | (v3 << (64 - 8));
-      }
-      if (!match_system(big_endian)) {
-        v4 = (v4 >> 8) | (v4 << (64 - 8));
-      }
-
-      if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) {
-        size_t final_pos = pos + 16;
-        while (pos < final_pos) {
-          *latin_output++ = !match_system(big_endian)
-                                ? char(utf16::swap_bytes(data[pos]))
-                                : char(data[pos]);
-          pos++;
-        }
-        continue;
-      }
+  if (buf < end) {
+    uint32_t mask(uint32_t(1 << (end - buf)) - 1);
+    __m512i in = _mm512_maskz_loadu_epi16(mask, buf);
+    if (big_endian) {
+      in = _mm512_shuffle_epi8(in, byteflip);
     }
-    word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    if ((word & 0xFF00) == 0) {
-      *latin_output++ = char(word & 0xFF);
-      pos++;
-    } else {
-      return result(error_code::TOO_LARGE, pos);
+    if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
+      return 0;
     }
+    _mm256_mask_storeu_epi8(
+        latin1_output, mask,
+        _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
   }
-  return result(error_code::SUCCESS, latin_output - start);
+  return len;
 }
 
-} // namespace utf16_to_latin1
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
-
-#endif
-/* end file src/scalar/utf16_to_latin1/utf16_to_latin1.h */
-/* begin file src/scalar/utf32_to_latin1/utf32_to_latin1.h */
-#ifndef SIMDUTF_UTF32_TO_LATIN1_H
-#define SIMDUTF_UTF32_TO_LATIN1_H
-
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf32_to_latin1 {
-
-inline size_t convert(const char32_t *buf, size_t len, char *latin1_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  char *start = latin1_output;
-  uint32_t utf32_char;
-  size_t pos = 0;
-  uint32_t too_large = 0;
-
-  while (pos < len) {
-    utf32_char = (uint32_t)data[pos];
-    too_large |= utf32_char;
-    *latin1_output++ = (char)(utf32_char & 0xFF);
-    pos++;
-  }
-  if ((too_large & 0xFFFFFF00) != 0) {
-    return 0;
+template <endianness big_endian>
+std::pair<result, char *>
+icelake_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
+                                            char *latin1_output) {
+  const char16_t *end = buf + len;
+  const char16_t *start = buf;
+  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  __m512i v_0xFF = _mm512_set1_epi16(0xff);
+  __m512i shufmask = _mm512_set_epi8(
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38,
+      36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+  while (end - buf >= 32) {
+    __m512i in = _mm512_loadu_si512((__m512i *)buf);
+    if (big_endian) {
+      in = _mm512_shuffle_epi8(in, byteflip);
+    }
+    if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
+      uint16_t word;
+      while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf))
+                                 : uint16_t(*buf))) <= 0xff) {
+        *latin1_output++ = uint8_t(word);
+        buf++;
+      }
+      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+                            latin1_output);
+    }
+    _mm256_storeu_si256(
+        (__m256i *)latin1_output,
+        _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
+    latin1_output += 32;
+    buf += 32;
   }
-  return latin1_output - start;
-}
+  if (buf < end) {
+    uint32_t mask(uint32_t(1 << (end - buf)) - 1);
+    __m512i in = _mm512_maskz_loadu_epi16(mask, buf);
+    if (big_endian) {
+      in = _mm512_shuffle_epi8(in, byteflip);
+    }
+    if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
 
-inline result convert_with_errors(const char32_t *buf, size_t len,
-                                  char *latin1_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  char *start{latin1_output};
-  size_t pos = 0;
-  while (pos < len) {
-    if (pos + 2 <=
-        len) { // if it is safe to read 8 more bytes, check that they are Latin1
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF00FFFFFF00) == 0) {
-        *latin1_output++ = char(buf[pos]);
-        *latin1_output++ = char(buf[pos + 1]);
-        pos += 2;
-        continue;
+      uint16_t word;
+      while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf))
+                                 : uint16_t(*buf))) <= 0xff) {
+        *latin1_output++ = uint8_t(word);
+        buf++;
       }
+      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+                            latin1_output);
     }
-    uint32_t utf32_char = data[pos];
-    if ((utf32_char & 0xFFFFFF00) ==
-        0) { // Check if the character can be represented in Latin-1
-      *latin1_output++ = (char)(utf32_char & 0xFF);
-      pos++;
-    } else {
-      return result(error_code::TOO_LARGE, pos);
-    };
+    _mm256_mask_storeu_epi8(
+        latin1_output, mask,
+        _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
   }
-  return result(error_code::SUCCESS, latin1_output - start);
+  return std::make_pair(result(error_code::SUCCESS, len), latin1_output);
 }
+/* end file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */
+/* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
+// file included directly
 
-} // namespace utf32_to_latin1
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
-
-#endif
-/* end file src/scalar/utf32_to_latin1/utf32_to_latin1.h */
-
-/* begin file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
-#ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H
-#define SIMDUTF_VALID_UTF8_TO_LATIN1_H
+/**
+ * This function converts the input (inbuf, inlen), assumed to be valid
+ * UTF-16 (byte order selected by 'big_endian'), into UTF-8 (to outbuf). The
+ * number of code units written is stored in '*outlen' and the function returns
+ * the number of input words consumed.
+ */
+template <endianness big_endian>
+size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
+                             unsigned char *outbuf, size_t *outlen) {
+  __m512i in;
+  __mmask32 inmask = _cvtu32_mask32(0x7fffffff);
+  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  const char16_t *const inbuf_orig = inbuf;
+  const unsigned char *const outbuf_orig = outbuf;
+  int adjust = 0;
+  int carry = 0;
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf8_to_latin1 {
+  while (inlen >= 32) {
+    in = _mm512_loadu_si512(inbuf);
+    if (big_endian) {
+      in = _mm512_shuffle_epi8(in, byteflip);
+    }
+    inlen -= 31;
+  lastiteration:
+    inbuf += 31;
 
-inline size_t convert_valid(const char *buf, size_t len, char *latin_output) {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
+  failiteration:
+    const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
+        inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);
 
-  size_t pos = 0;
-  char *start{latin_output};
+    if (_ktestz_mask32_u8(inmask, is234byte)) {
+      // fast path for ASCII only
+      _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
+      outbuf += 31;
+      carry = 0;
 
-  while (pos < len) {
-    // try to convert the next block of 16 ASCII bytes
-    if (pos + 16 <=
-        len) { // if it is safe to read 16 more bytes, check that they are ascii
-      uint64_t v1;
-      ::memcpy(&v1, data + pos, sizeof(uint64_t));
-      uint64_t v2;
-      ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
-      uint64_t v{v1 |
-                 v2}; // We are only interested in these bits: 1000 1000 1000
-                      // 1000, so it makes sense to concatenate everything
-      if ((v & 0x8080808080808080) ==
-          0) { // if NONE of these are set, e.g. all of them are zero, then
-               // everything is ASCII
-        size_t final_pos = pos + 16;
-        while (pos < final_pos) {
-          *latin_output++ = char(buf[pos]);
-          pos++;
-        }
+      if (inlen < 32) {
+        goto tail;
+      } else {
         continue;
       }
     }
 
-    // suppose it is not an all ASCII byte sequence
-    uint8_t leading_byte = data[pos]; // leading byte
-    if (leading_byte < 0b10000000) {
-      // converting one ASCII byte !!!
-      *latin_output++ = char(leading_byte);
-      pos++;
-    } else if ((leading_byte & 0b11100000) ==
-               0b11000000) { // the first three bits indicate:
-      // We have a two-byte UTF-8
-      if (pos + 1 >= len) {
-        break;
-      } // minimal bound checking
-      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
-        return 0;
-      } // checks if the next byte is a valid continuation byte in UTF-8. A
-        // valid continuation byte starts with 10.
-      // range check -
-      uint32_t code_point =
-          (leading_byte & 0b00011111) << 6 |
-          (data[pos + 1] &
-           0b00111111); // assembles the Unicode code point from the two bytes.
-                        // It does this by discarding the leading 110 and 10
-                        // bits from the two bytes, shifting the remaining bits
-                        // of the first byte, and then combining the results
-                        // with a bitwise OR operation.
-      *latin_output++ = char(code_point);
-      pos += 2;
-    } else {
-      // we may have a continuation but we do not do error checking
-      return 0;
-    }
-  }
-  return latin_output - start;
-}
-
-} // namespace utf8_to_latin1
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+    const __mmask32 is12byte =
+        _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);
 
-#endif
-/* end file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */
-/* begin file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
-#ifndef SIMDUTF_VALID_UTF16_TO_LATIN1_H
-#define SIMDUTF_VALID_UTF16_TO_LATIN1_H
+    if (_ktestc_mask32_u8(is12byte, inmask)) {
+      // fast path for 1 and 2 byte only
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf16_to_latin1 {
+      const __m512i twobytes = _mm512_ternarylogic_epi32(
+          _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
+          _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
+      in = _mm512_mask_add_epi16(in, is234byte, twobytes,
+                                 _mm512_set1_epi16(int16_t(0x80c0)));
+      const __m512i cmpmask =
+          _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
+                                  _mm512_set1_epi16(0x0800));
+      const __mmask64 smoosh =
+          _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
+      const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
+      _mm512_mask_storeu_epi8(outbuf,
+                              _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh),
+                                                       _cvtmask64_u64(smoosh))),
+                              out);
+      outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
+      carry = 0;
 
-template <endianness big_endian>
-inline size_t convert_valid(const char16_t *buf, size_t len,
-                            char *latin_output) {
-  const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
-  size_t pos = 0;
-  char *start{latin_output};
-  uint16_t word = 0;
+      if (inlen < 32) {
+        goto tail;
+      } else {
+        continue;
+      }
+    }
+    __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
+    __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
 
-  while (pos < len) {
-    word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos];
-    *latin_output++ = char(word);
-    pos++;
-  }
+    __m512i taglo = _mm512_set1_epi32(0x8080e000);
+    __m512i taghi = taglo;
 
-  return latin_output - start;
-}
+    const __m512i fc00masked =
+        _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
+    const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
+        inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
+    const __mmask32 losurr = _mm512_cmp_epu16_mask(
+        fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);
 
-} // namespace utf16_to_latin1
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+    int carryout = 0;
+    if (!_kortestz_mask32_u8(hisurr, losurr)) {
+      // handle surrogates
 
-#endif
-/* end file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */
-/* begin file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
-#ifndef SIMDUTF_VALID_UTF32_TO_LATIN1_H
-#define SIMDUTF_VALID_UTF32_TO_LATIN1_H
+      __m512i los = _mm512_alignr_epi32(hi, lo, 1);
+      __m512i his = _mm512_alignr_epi32(lo, hi, 1);
 
-namespace simdutf {
-namespace scalar {
-namespace {
-namespace utf32_to_latin1 {
+      const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
+      taglo = _mm512_mask_mov_epi32(taglo, __mmask16(hisurr),
+                                    _mm512_set1_epi32(0x808080f0));
+      taghi = _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi),
+                                    _mm512_set1_epi32(0x808080f0));
 
-inline size_t convert_valid(const char32_t *buf, size_t len,
-                            char *latin1_output) {
-  const uint32_t *data = reinterpret_cast<const uint32_t *>(buf);
-  char *start = latin1_output;
-  uint32_t utf32_char;
-  size_t pos = 0;
+      lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
+      hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
+      los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
+      his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
+      lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
+      hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);
 
-  while (pos < len) {
-    utf32_char = (uint32_t)data[pos];
+      carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));
 
-    if (pos + 2 <=
-        len) { // if it is safe to read 8 more bytes, check that they are Latin1
-      uint64_t v;
-      ::memcpy(&v, data + pos, sizeof(uint64_t));
-      if ((v & 0xFFFFFF00FFFFFF00) == 0) {
-        *latin1_output++ = char(buf[pos]);
-        *latin1_output++ = char(buf[pos + 1]);
-        pos += 2;
-        continue;
-      } else {
-        // output can not be represented in latin1
-        return 0;
+      const uint32_t h = _cvtmask32_u32(hisurr);
+      const uint32_t l = _cvtmask32_u32(losurr);
+      // check for mismatched surrogates
+      if ((h + h + carry) ^ l) {
+        const uint32_t lonohi = l & ~(h + h + carry);
+        const uint32_t hinolo = h & ~(l >> 1);
+        inlen = _tzcnt_u32(hinolo | lonohi);
+        inmask = __mmask32(0x7fffffff & ((1U << inlen) - 1));
+        in = _mm512_maskz_mov_epi16(inmask, in);
+        adjust = (int)inlen - 31;
+        inlen = 0;
+        goto failiteration;
       }
     }
-    if ((utf32_char & 0xFFFFFF00) == 0) {
-      *latin1_output++ = char(utf32_char);
-    } else {
-      // output can not be represented in latin1
-      return 0;
-    }
-    pos++;
-  }
-  return latin1_output - start;
-}
 
-} // namespace utf32_to_latin1
-} // unnamed namespace
-} // namespace scalar
-} // namespace simdutf
+    hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff), hi);
+    carry = carryout;
 
-#endif
-/* end file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */
+    __m512i mslo =
+        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);
 
-SIMDUTF_PUSH_DISABLE_WARNINGS
-SIMDUTF_DISABLE_UNDESIRED_WARNINGS
+    __m512i mshi =
+        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);
 
-#if SIMDUTF_IMPLEMENTATION_ARM64
-/* begin file src/arm64/implementation.cpp */
-/* begin file src/simdutf/arm64/begin.h */
-// redefining SIMDUTF_IMPLEMENTATION to "arm64"
-// #define SIMDUTF_IMPLEMENTATION arm64
-/* end file src/simdutf/arm64/begin.h */
-namespace simdutf {
-namespace arm64 {
-namespace {
-#ifndef SIMDUTF_ARM64_H
-  #error "arm64.h must be included"
-#endif
-using namespace simd;
+    const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
+    const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);
 
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
-  simd8<uint8_t> bits = input.reduce_or();
-  return bits.max_val() < 0b10000000u;
-}
+    const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
+    const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
+    const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);
 
-simdutf_unused simdutf_really_inline simd8<bool>
-must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
-                     const simd8<uint8_t> prev3) {
-  simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u);
-  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
-  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
-  // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller
-  // is using ^ as well. This will work fine because we only have to report
-  // errors for cases with 0-1 lead bytes. Multiple lead bytes implies 2
-  // overlapping multibyte characters, and if that happens, there is guaranteed
-  // to be at least *one* lead byte that is part of only 1 other multibyte
-  // character. The error will be detected there.
-  return is_second_byte ^ is_third_byte ^ is_fourth_byte;
-}
+    taglo = _mm512_mask_mov_epi32(taglo, __mmask16(is12byte),
+                                  _mm512_set1_epi32(0x80c00000));
+    taghi = _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi),
+                                  _mm512_set1_epi32(0x80c00000));
+    __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask),
+                                              _mm512_set1_epi32(0xffffffff),
+                                              _mm512_set1_epi32(0x00010101));
+    __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi),
+                                              _mm512_set1_epi32(0xffffffff),
+                                              _mm512_set1_epi32(0x00010101));
 
-simdutf_really_inline simd8<bool>
-must_be_2_3_continuation(const simd8<uint8_t> prev2,
-                         const simd8<uint8_t> prev3) {
-  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
-  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
-  return is_third_byte ^ is_fourth_byte;
-}
+    magiclo = _mm512_mask_blend_epi32(__mmask16(outmask),
+                                      _mm512_set1_epi32(0xffffffff),
+                                      _mm512_set1_epi32(0x00010101));
+    magichi = _mm512_mask_blend_epi32(__mmask16(outmhi),
+                                      _mm512_set1_epi32(0xffffffff),
+                                      _mm512_set1_epi32(0x00010101));
 
-// common functions for utf8 conversions
-simdutf_really_inline uint16x4_t convert_utf8_3_byte_to_utf16(uint8x16_t in) {
-  // Low half contains  10cccccc|1110aaaa
-  // High half contains 10bbbbbb|10bbbbbb
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t sh = simdutf_make_uint8x16_t(0, 2, 3, 5, 6, 8, 9, 11, 1, 1,
-                                                4, 4, 7, 7, 10, 10);
-#else
-  const uint8x16_t sh = {0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10};
-#endif
-  uint8x16_t perm = vqtbl1q_u8(in, sh);
-  // Split into half vectors.
-  // 10cccccc|1110aaaa
-  uint8x8_t perm_low = vget_low_u8(perm); // no-op
-  // 10bbbbbb|10bbbbbb
-  uint8x8_t perm_high = vget_high_u8(perm);
-  // xxxxxxxx 10bbbbbb
-  uint16x4_t mid = vreinterpret_u16_u8(perm_high); // no-op
-  // xxxxxxxx 1110aaaa
-  uint16x4_t high = vreinterpret_u16_u8(perm_low); // no-op
-  // Assemble with shift left insert.
-  // xxxxxxaa aabbbbbb
-  uint16x4_t mid_high = vsli_n_u16(mid, high, 6);
-  // (perm_low << 8) | (perm_low >> 8)
-  // xxxxxxxx 10cccccc
-  uint16x4_t low = vreinterpret_u16_u8(vrev16_u8(perm_low));
-  // Shift left insert into the low bits
-  // aaaabbbb bbcccccc
-  uint16x4_t composed = vsli_n_u16(low, mid_high, 6);
-  return composed;
-}
+    mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
+                                     0xea); // A&B|C
+    mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
+                                     0xea);
+    mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);
 
-simdutf_really_inline uint16x8_t convert_utf8_2_byte_to_utf16(uint8x16_t in) {
-  // Converts 6 2 byte UTF-8 characters to 6 UTF-16 characters.
-  // Technically this calculates 8, but 6 does better and happens more often
-  // (The languages which use these codepoints use ASCII spaces so 8 would need
-  // to be in the middle of a very long word).
+    mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);
 
-  // 10bbbbbb 110aaaaa
-  uint16x8_t upper = vreinterpretq_u16_u8(in);
-  // (in << 8) | (in >> 8)
-  // 110aaaaa 10bbbbbb
-  uint16x8_t lower = vreinterpretq_u16_u8(vrev16q_u8(in));
-  // 00000000 000aaaaa
-  uint16x8_t upper_masked = vandq_u16(upper, vmovq_n_u16(0x1F));
-  // Assemble with shift left insert.
-  // 00000aaa aabbbbbb
-  uint16x8_t composed = vsliq_n_u16(lower, upper_masked, 6);
-  return composed;
-}
+    const __mmask64 wantlo =
+        _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
+    const __mmask64 wanthi =
+        _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
+    const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
+    const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
+    const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
+    const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);
 
-simdutf_really_inline uint16x8_t
-convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in, size_t shufutf8_idx) {
-  // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
-  // This is a relatively easy scenario
-  // we process SIX (6) input code-code units. The max length in bytes of six
-  // code code units spanning between 1 and 2 bytes each is 12 bytes.
-  uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
-      simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]));
-  // Shuffle
-  // 1 byte: 00000000 0bbbbbbb
-  // 2 byte: 110aaaaa 10bbbbbb
-  uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh));
-  // Mask
-  // 1 byte: 00000000 0bbbbbbb
-  // 2 byte: 00000000 00bbbbbb
-  uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits
-  // 1 byte: 00000000 00000000
-  // 2 byte: 000aaaaa 00000000
-  uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits
-  // Combine with a shift right accumulate
-  // 1 byte: 00000000 0bbbbbbb
-  // 2 byte: 00000aaa aabbbbbb
-  uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2);
-  return composed;
+    uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
+    uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);
+
+    _mm512_mask_storeu_epi8(
+        outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
+    _mm512_mask_storeu_epi8(
+        outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)),
+        outhi);
+    outbuf += advlo + advhi;
+  }
+  outbuf += -adjust;
+
+tail:
+  if (inlen != 0) {
+    // We must have inlen < 32 (at most 31 code units remain).
+    inmask = _cvtu32_mask32((1U << inlen) - 1);
+    in = _mm512_maskz_loadu_epi16(inmask, inbuf);
+    if (big_endian) {
+      in = _mm512_shuffle_epi8(in, byteflip);
+    }
+    adjust = (int)inlen - 31;
+    inlen = 0;
+    goto lastiteration;
+  }
+  *outlen = (outbuf - outbuf_orig) + adjust;
+  return ((inbuf - inbuf_orig) + adjust);
 }
+/* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
+/* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
+// file included directly
 
-/* begin file src/arm64/arm_validate_utf16.cpp */
+/*
+  Returns a tuple: the first unprocessed code unit from buf, utf32_output,
+  and a bool. A scalar routine should carry on the conversion of the tail.
+*/
 template <endianness big_endian>
-const char16_t *arm_validate_utf16(const char16_t *input, size_t size) {
-  const char16_t *end = input + size;
-  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
-  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
-  const auto v_fc = simd8<uint8_t>::splat(0xfc);
-  const auto v_dc = simd8<uint8_t>::splat(0xdc);
-  while (end - input >= 16) {
-    // 0. Load data: since the validation takes into account only higher
-    //    byte of each word, we compress the two vectors into one which
-    //    consists only the higher bytes.
-    auto in0 = simd16<uint16_t>(input);
-    auto in1 =
-        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
-    if (!match_system(big_endian)) {
-      in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0)));
-      in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1)));
+std::tuple<const char16_t *, char32_t *, bool>
+convert_utf16_to_utf32(const char16_t *buf, size_t len,
+                       char32_t *utf32_output) {
+  const char16_t *end = buf + len;
+  const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
+  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+  const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
+  __mmask32 carry{0};
+  const __m512i byteflip = _mm512_setr_epi64(
+      0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+      0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+      0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  while (std::distance(buf, end) >= 32) {
+    // Always safe: end - buf >= 32 code units, so 64 bytes are readable:
+    __m512i in = _mm512_loadu_si512((__m512i *)buf);
+    if (big_endian) {
+      in = _mm512_shuffle_epi8(in, byteflip);
     }
-    const auto t0 = in0.shr<8>();
-    const auto t1 = in1.shr<8>();
-    const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
-    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
-    const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
-    if (surrogates_wordmask == 0) {
-      input += 16;
-    } else {
-      // 2. We have some surrogates that have to be distinguished:
-      //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
-      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
-      //
-      //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
 
-      // V - non-surrogate code units
-      //     V = not surrogates_wordmask
-      const uint64_t V = ~surrogates_wordmask;
+    // H - bitmask for high surrogates
+    const __mmask32 H =
+        _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
+    // L - bitmask for low surrogates
+    const __mmask32 L =
+        _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);
 
-      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-      const auto vH = ((in & v_fc) == v_dc);
-      const uint64_t H = vH.to_bitmask64();
+    if ((H | L)) {
+      // surrogate pair(s) in a register
+      const __mmask32 V =
+          (L ^
+           (carry | (H << 1))); // A high surrogate must be followed by low one
+                                // and a low one must be preceded by a high one.
+                                // If valid, V should be equal to 0
 
-      // L - word mask for low surrogates
-      //     L = not H and surrogates_wordmask
-      const uint64_t L = ~H & surrogates_wordmask;
+      if (V == 0) {
+        // valid case
+        /*
+            Input surrogate pair:
+            |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
+                low surrogate      high surrogate
+        */
+        /*  1. Expand all code units to 32-bit code units
+            in
+           |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
+        */
+        const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
+        const __m512i second =
+            _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
 
-      const uint64_t a =
-          L & (H >> 4); // A low surrogate must be followed by high one.
-                        // (A low surrogate placed in the 7th register's word
-                        // is an exception we handle.)
-      const uint64_t b =
-          a << 4; // Just mark that the opposite fact is hold,
-                  // thanks to that we have only two masks for valid case.
-      const uint64_t c = V | a | b; // Combine all the masks into the final one.
-      if (c == ~0ull) {
-        // The whole input register contains valid UTF-16, i.e.,
-        // either single code units or proper surrogate pairs.
-        input += 16;
-      } else if (c == 0xfffffffffffffffull) {
-        // The 15 lower code units of the input register contains valid UTF-16.
-        // The 15th word may be either a low or high surrogate. It the next
-        // iteration we 1) check if the low surrogate is followed by a high
-        // one, 2) reject sole high surrogate.
-        input += 15;
+        /*  2. Shift by one 16-bit word to align low surrogates with high
+           surrogates in
+           |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
+            shifted
+           |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
+        */
+        const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1);
+        const __m512i shifted_second =
+            _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1);
+
+        /*  3. Align all high surrogates in first and second by shifting to the
+           left by 10 bits
+            |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
+        */
+        const __m512i aligned_first =
+            _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10);
+        const __m512i aligned_second =
+            _mm512_mask_slli_epi32(second, (__mmask16)(H >> 16), second, 10);
+
+        /*  4. Remove surrogate prefixes and add offset 0x10000 by adding in,
+           shifted and constant in
+           |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
+            shifted
+           |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
+            constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000|
+        */
+        const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400);
+        const __m512i added_first = _mm512_mask_add_epi32(
+            aligned_first, (__mmask16)H, aligned_first, shifted_first);
+        const __m512i utf32_first = _mm512_mask_add_epi32(
+            added_first, (__mmask16)H, added_first, constant);
+
+        const __m512i added_second =
+            _mm512_mask_add_epi32(aligned_second, (__mmask16)(H >> 16),
+                                  aligned_second, shifted_second);
+        const __m512i utf32_second = _mm512_mask_add_epi32(
+            added_second, (__mmask16)(H >> 16), added_second, constant);
+
+        //  5. Store all valid UTF-32 code units (low surrogate positions and
+        //  32nd word are invalid)
+        const __mmask32 valid = ~L & 0x7fffffff;
+        // We deliberately do a _mm512_maskz_compress_epi32 followed by
+        // storeu_epi32 to ease performance portability to Zen 4.
+        const __m512i compressed_first =
+            _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first);
+        const size_t howmany1 = count_ones((uint16_t)(valid));
+        _mm512_storeu_si512((__m512i *)utf32_output, compressed_first);
+        utf32_output += howmany1;
+        const __m512i compressed_second =
+            _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second);
+        const size_t howmany2 = count_ones((uint16_t)(valid >> 16));
+        // The following could be unsafe in some cases?
+        //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second);
+        _mm512_mask_storeu_epi32((__m512i *)utf32_output,
+                                 __mmask16((1 << howmany2) - 1),
+                                 compressed_second);
+        utf32_output += howmany2;
+        // Only process 31 code units, but keep track if the 31st word is a high
+        // surrogate as a carry
+        buf += 31;
+        carry = (H >> 30) & 0x1;
       } else {
-        return nullptr;
+        // invalid case
+        return std::make_tuple(buf + carry, utf32_output, false);
+      }
+    } else {
+      // no surrogates
+      // extend all thirty-two 16-bit code units to thirty-two 32-bit code units
+      _mm512_storeu_si512((__m512i *)(utf32_output),
+                          _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)));
+      _mm512_storeu_si512(
+          (__m512i *)(utf32_output) + 1,
+          _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)));
+      utf32_output += 32;
+      buf += 32;
+      carry = 0;
+    }
+  } // while
+  return std::make_tuple(buf + carry, utf32_output, true);
+}
+/* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
+/* begin file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */
+// file included directly
+size_t icelake_convert_utf32_to_latin1(const char32_t *buf, size_t len,
+                                       char *latin1_output) { // UTF-32 -> Latin-1; returns len, or 0 if any code unit is above 0xff
+  const char32_t *end = buf + len;
+  __m512i v_0xFF = _mm512_set1_epi32(0xff); // largest value accepted in each 32-bit lane
+  __m512i shufmask = _mm512_set_epi8( // selects byte 0 of every 32-bit lane (offsets 0, 4, ..., 60) into the low 16 bytes
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
+      56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
+  while (end - buf >= 16) { // full blocks of 16 code units
+    __m512i in = _mm512_loadu_si512((__m512i *)buf);
+    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
+      return 0; // failure; blocks stored by earlier iterations stay written
+    }
+    _mm_storeu_si128(
+        (__m128i *)latin1_output,
+        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
+    latin1_output += 16;
+    buf += 16;
+  }
+  if (buf < end) { // tail of 1..15 code units, handled with masked load/store
+    uint16_t mask = uint16_t((1 << (end - buf)) - 1); // low (end - buf) bits set
+    __m512i in = _mm512_maskz_loadu_epi32(mask, buf);
+    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
+      return 0;
+    }
+    _mm_mask_storeu_epi8(
+        latin1_output, mask,
+        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
+  }
+  return len; // one output byte was produced per input code unit
+}
+
+std::pair<result, char *>
+icelake_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+                                            char *latin1_output) { // UTF-32 -> Latin-1; reports TOO_LARGE at the first code unit above 0xff
+  const char32_t *end = buf + len;
+  const char32_t *start = buf; // kept so the error offset can be reported as buf - start
+  __m512i v_0xFF = _mm512_set1_epi32(0xff); // largest value accepted in each 32-bit lane
+  __m512i shufmask = _mm512_set_epi8( // selects byte 0 of every 32-bit lane (offsets 0, 4, ..., 60) into the low 16 bytes
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
+      56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
+  while (end - buf >= 16) { // full blocks of 16 code units
+    __m512i in = _mm512_loadu_si512((__m512i *)buf);
+    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { // at least one lane of this block is above 0xff
+      while (uint32_t(*buf) <= 0xff) { // scalar copy of the valid prefix; stops on the offending code unit
+        *latin1_output++ = uint8_t(*buf++);
       }
+      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+                            latin1_output);
     }
+    _mm_storeu_si128(
+        (__m128i *)latin1_output,
+        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
+    latin1_output += 16;
+    buf += 16;
   }
-  return input;
+  if (buf < end) { // tail of 1..15 code units, handled with masked load/store
+    uint16_t mask = uint16_t((1 << (end - buf)) - 1); // low (end - buf) bits set
+    __m512i in = _mm512_maskz_loadu_epi32(mask, buf);
+    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { // masked-off lanes load as zero, so only real input can trigger this
+      while (uint32_t(*buf) <= 0xff) {
+        *latin1_output++ = uint8_t(*buf++);
+      }
+      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+                            latin1_output);
+    }
+    _mm_mask_storeu_epi8(
+        latin1_output, mask,
+        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
+  }
+  return std::make_pair(result(error_code::SUCCESS, len), latin1_output); // NOTE(review): latin1_output is not advanced past the masked tail store above -- confirm callers ignore .second
 }
+/* end file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */
+/* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
+// file included directly
 
-template <endianness big_endian>
-const result arm_validate_utf16_with_errors(const char16_t *input,
-                                            size_t size) {
-  const char16_t *start = input;
-  const char16_t *end = input + size;
+// Todo: currently, this is just the haswell code, optimize for icelake kernel.
+std::pair<const char32_t *, char *>
+avx512_convert_utf32_to_utf8(const char32_t *buf, size_t len,
+                             char *utf8_output) { // UTF-32 -> UTF-8, 16 code units per step; .first == nullptr flags invalid input
+  const char32_t *end = buf + len;
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+  __m256i running_max = _mm256_setzero_si256(); // lane-wise maximum of every loaded input, range-checked after the loop
+  __m256i forbidden_bytemask = _mm256_setzero_si256(); // accumulates 16-bit lanes in 0xD800..0xDFFF seen on the 1-3 byte path
 
-  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
-  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
-  const auto v_fc = simd8<uint8_t>::splat(0xfc);
-  const auto v_dc = simd8<uint8_t>::splat(0xdc);
-  while (input + 16 < end) {
-    // 0. Load data: since the validation takes into account only higher
-    //    byte of each word, we compress the two vectors into one which
-    //    consists only the higher bytes.
-    auto in0 = simd16<uint16_t>(input);
-    auto in1 =
-        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
 
-    if (!match_system(big_endian)) {
-      in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0)));
-      in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1)));
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { // stops once fewer than 16 + safety_margin code units remain
+    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
+    running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
+
+    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
+    // saturation
+    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
+                                        _mm256_and_si256(nextin, v_7fffffff));
+    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+
+    // Try to apply UTF-16 => UTF-8 routine on 256 bits
+    // (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+      // 1. pack the bytes
+      const __m128i utf8_packed = _mm_packus_epi16(
+          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+      // 2. store (16 bytes)
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+      // 3. adjust pointers
+      buf += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
     }
-    const auto t0 = in0.shr<8>();
-    const auto t1 = in1.shr<8>();
-    const simd8<uint8_t> in = simd16<uint16_t>::pack(t0, t1);
-    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
-    const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64();
-    if (surrogates_wordmask == 0) {
-      input += 16;
-    } else {
-      // 2. We have some surrogates that have to be distinguished:
-      //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
-      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
-      //
-      //    Fact: high surrogate has 11th bit set (3rd bit in the higher word)
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
 
-      // V - non-surrogate code units
-      //     V = not surrogates_wordmask
-      const uint64_t V = ~surrogates_wordmask;
+    // no bits set above 11th bit
+    const __m256i one_or_two_bytes_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+    const uint32_t one_or_two_bytes_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+    if (one_or_two_bytes_bitmask == 0xffffffff) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
 
-      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
-      const auto vH = ((in & v_fc) == v_dc);
-      const uint64_t H = vH.to_bitmask64();
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const __m256i t3 = _mm256_or_si256(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const __m256i t4 = _mm256_or_si256(t3, v_c080);
 
-      // L - word mask for low surrogates
-      //     L = not H and surrogates_wordmask
-      const uint64_t L = ~H & surrogates_wordmask;
+      // 2. merge ASCII and 2-byte codewords
+      const __m256i utf8_unpacked =
+          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
 
-      const uint64_t a =
-          L & (H >> 4); // A low surrogate must be followed by high one.
-                        // (A low surrogate placed in the 7th register's word
-                        // is an exception we handle.)
-      const uint64_t b =
-          a << 4; // Just mark that the opposite fact is hold,
-                  // thanks to that we have only two masks for valid case.
-      const uint64_t c = V | a | b; // Combine all the masks into the final one.
-      if (c == ~0ull) {
-        // The whole input register contains valid UTF-16, i.e.,
-        // either single code units or proper surrogate pairs.
-        input += 16;
-      } else if (c == 0xfffffffffffffffull) {
-        // The 15 lower code units of the input register contains valid UTF-16.
-        // The 15th word may be either a low or high surrogate. It the next
-        // iteration we 1) check if the low surrogate is followed by a high
-        // one, 2) reject sole high surrogate.
-        input += 15;
-      } else {
-        return result(error_code::SURROGATE, input - start);
-      }
-    }
-  }
-  return result(error_code::SUCCESS, input - start);
-}
-/* end file src/arm64/arm_validate_utf16.cpp */
-/* begin file src/arm64/arm_validate_utf32le.cpp */
+      // 3. prepare bitmask for 8-bit lookup
+      const uint32_t M0 = one_byte_bitmask & 0x55555555;
+      const uint32_t M1 = M0 >> 7;
+      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+      // 4. pack the bytes
 
-const char32_t *arm_validate_utf32le(const char32_t *input, size_t size) {
-  const char32_t *end = input + size;
+      const uint8_t *row =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+      const uint8_t *row_2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+                                                                       16)][0];
 
-  const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
-  const uint32x4_t offset = vmovq_n_u32(0xffff2000);
-  const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
-  uint32x4_t currentmax = vmovq_n_u32(0x0);
-  uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
+      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
 
-  while (end - input >= 4) {
-    const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input));
-    currentmax = vmaxq_u32(in, currentmax);
-    currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
-    input += 4;
-  }
+      const __m256i utf8_packed = _mm256_shuffle_epi8(
+          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+      // 5. store bytes
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_castsi256_si128(utf8_packed));
+      utf8_output += row[0];
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_extractf128_si256(utf8_packed, 1));
+      utf8_output += row_2[0];
 
-  uint32x4_t is_zero =
-      veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
-  if (vmaxvq_u32(is_zero) != 0) {
-    return nullptr;
-  }
+      // 6. adjust pointers
+      buf += 16;
+      continue;
+    }
+    // Must check for overflow in packing
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
+        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+    if (saturation_bitmask == 0xffffffff) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+      forbidden_bytemask = _mm256_or_si256(
+          forbidden_bytemask,
+          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
 
-  is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax),
-                      standardoffsetmax);
-  if (vmaxvq_u32(is_zero) != 0) {
-    return nullptr;
-  }
+      const __m256i dup_even = _mm256_setr_epi16(
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
 
-  return input;
-}
+      /* In this branch we handle three cases:
+        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+        single UTF-8 byte
+        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
+        UTF-8 bytes
+        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+        three UTF-8 bytes
 
-const result arm_validate_utf32le_with_errors(const char32_t *input,
-                                              size_t size) {
-  const char32_t *start = input;
-  const char32_t *end = input + size;
+        We expand the input word (16-bit) into two code units (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
 
-  const uint32x4_t standardmax = vmovq_n_u32(0x10ffff);
-  const uint32x4_t offset = vmovq_n_u32(0xffff2000);
-  const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff);
-  uint32x4_t currentmax = vmovq_n_u32(0x0);
-  uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
 
-  while (end - input >= 4) {
-    const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input));
-    currentmax = vmaxq_u32(in, currentmax);
-    currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
 
-    uint32x4_t is_zero =
-        veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax);
-    if (vmaxvq_u32(is_zero) != 0) {
-      return result(error_code::TOO_LARGE, input - start);
-    }
+        Finally from these two code units we build proper UTF-8 sequence, taking
+        into account the case (i.e, the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
 
-    is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax),
-                        standardoffsetmax);
-    if (vmaxvq_u32(is_zero) != 0) {
-      return result(error_code::SURROGATE, input - start);
-    }
+      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+                                             simdutf_vec(0b0100000000000000));
+      const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
 
-    input += 4;
-  }
+      // 4. expand code units 16-bit => 32-bit
+      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
 
-  return result(error_code::SUCCESS, input - start);
-}
-/* end file src/arm64/arm_validate_utf32le.cpp */
+      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+      // Due to the wider registers, the following path is less likely to be
+      // useful.
+      /*if(mask == 0) {
+        // We only have three-byte code units. Use fast path.
+        const __m256i shuffle =
+      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+      _mm256_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output,
+      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output,
+      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+        continue;
+      }*/
+      const uint8_t mask0 = uint8_t(mask);
+      const uint8_t *row0 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+      const __m128i utf8_0 =
+          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
 
-/* begin file src/arm64/arm_convert_latin1_to_utf16.cpp */
-template <endianness big_endian>
-std::pair<const char *, char16_t *>
-arm_convert_latin1_to_utf16(const char *buf, size_t len,
-                            char16_t *utf16_output) {
-  const char *end = buf + len;
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+      const uint8_t *row1 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+      const __m128i utf8_1 =
+          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
 
-  while (end - buf >= 16) {
-    uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(buf));
-    uint16x8_t inlow = vmovl_u8(vget_low_u8(in8));
-    if (!match_system(big_endian)) {
-      inlow = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inlow)));
-    }
-    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), inlow);
-    uint16x8_t inhigh = vmovl_u8(vget_high_u8(in8));
-    if (!match_system(big_endian)) {
-      inhigh = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inhigh)));
+      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+      const uint8_t *row2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+      const __m128i utf8_2 =
+          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+      const uint8_t *row3 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+      const __m128i utf8_3 =
+          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+      utf8_output += row0[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+      utf8_output += row1[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+      utf8_output += row2[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+      utf8_output += row3[0];
+      buf += 16;
+    } else {
+      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
+      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+      // wasteful to use scalar code, but being efficient with SIMD may require
+      // large, non-trivial tables?
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+          *utf8_output++ = char(word);
+        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(nullptr, utf8_output);
+          }
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else { // 4-byte
+          if (word > 0x10FFFF) {
+            return std::make_pair(nullptr, utf8_output);
+          }
+          *utf8_output++ = char((word >> 18) | 0b11110000);
+          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k; // skip the code units converted by the scalar loop
     }
-    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output + 8), inhigh);
-    utf16_output += 16;
-    buf += 16;
+  } // while
+
+  // check for invalid input
+  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+  if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(
+          _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
+    return std::make_pair(nullptr, utf8_output); // some loaded code unit was above 0x10ffff
   }
 
-  return std::make_pair(buf, utf16_output);
+  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+    return std::make_pair(nullptr, utf8_output); // a value in 0xD800..0xDFFF was recorded in forbidden_bytemask
+  }
+
+  return std::make_pair(buf, utf8_output); // buf points at the first code unit not converted here
 }
-/* end file src/arm64/arm_convert_latin1_to_utf16.cpp */
-/* begin file src/arm64/arm_convert_latin1_to_utf32.cpp */
-std::pair<const char *, char32_t *>
-arm_convert_latin1_to_utf32(const char *buf, size_t len,
-                            char32_t *utf32_output) {
-  const char *end = buf + len;
 
-  while (end - buf >= 16) {
-    uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(buf));
-    uint16x8_t in8low = vmovl_u8(vget_low_u8(in8));
-    uint32x4_t in16lowlow = vmovl_u16(vget_low_u16(in8low));
-    uint32x4_t in16lowhigh = vmovl_u16(vget_high_u16(in8low));
-    uint16x8_t in8high = vmovl_u8(vget_high_u8(in8));
-    uint32x4_t in8highlow = vmovl_u16(vget_low_u16(in8high));
-    uint32x4_t in8highhigh = vmovl_u16(vget_high_u16(in8high));
-    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output), in16lowlow);
-    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 4), in16lowhigh);
-    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 8), in8highlow);
-    vst1q_u32(reinterpret_cast<uint32_t *>(utf32_output + 12), in8highhigh);
+// Todo: currently, this is just the haswell code, optimize for icelake kernel.
+std::pair<result, char *>
+avx512_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
+                                         char *utf8_output) {
+  const char32_t *end = buf + len;
+  const char32_t *start = buf;
 
-    utf32_output += 16;
-    buf += 16;
-  }
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
 
-  return std::make_pair(buf, utf32_output);
-}
-/* end file src/arm64/arm_convert_latin1_to_utf32.cpp */
-/* begin file src/arm64/arm_convert_latin1_to_utf8.cpp */
-/*
-  Returns a pair: the first unprocessed byte from buf and utf8_output
-  A scalar routing should carry on the conversion of the tail.
-*/
-std::pair<const char *, char *>
-arm_convert_latin1_to_utf8(const char *latin1_input, size_t len,
-                           char *utf8_out) {
-  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
-  const char *end = latin1_input + len;
-  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-  // We always write 16 bytes, of which more than the first 8 bytes
-  // are valid. A safety margin of 8 is more than sufficient.
-  while (end - latin1_input >= 16 + 8) {
-    uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(latin1_input));
-    if (vmaxvq_u8(in8) <= 0x7F) { // ASCII fast path!!!!
-      vst1q_u8(utf8_output, in8);
-      utf8_output += 16;
-      latin1_input += 16;
-      continue;
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
+    // Check for too large input
+    const __m256i max_input =
+        _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(
+            _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
+      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+                            utf8_output);
     }
 
-    // We just fallback on UTF-16 code. This could be optimized/simplified
-    // further.
-    uint16x8_t in16 = vmovl_u8(vget_low_u8(in8));
-    // 1. prepare 2-byte values
-    // input 8-bit word : [aabb|bbbb] x 8
-    // expected output   : [1100|00aa|10bb|bbbb] x 8
-    const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-    const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
+    // saturation
+    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
+                                        _mm256_and_si256(nextin, v_7fffffff));
+    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
 
-    // t0 = [0000|00aa|bbbb|bb00]
-    const uint16x8_t t0 = vshlq_n_u16(in16, 2);
-    // t1 = [0000|00aa|0000|0000]
-    const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-    // t2 = [0000|0000|00bb|bbbb]
-    const uint16x8_t t2 = vandq_u16(in16, v_003f);
-    // t3 = [0000|00aa|00bb|bbbb]
-    const uint16x8_t t3 = vorrq_u16(t1, t2);
-    // t4 = [1100|00aa|10bb|bbbb]
-    const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-    // 2. merge ASCII and 2-byte codewords
-    const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-    const uint16x8_t one_byte_bytemask = vcleq_u16(in16, v_007f);
-    const uint8x16_t utf8_unpacked =
-        vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in16, t4));
-    // 3. prepare bitmask for 8-bit lookup
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-    const uint16x8_t mask = simdutf_make_uint16x8_t(
-        0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
-#else
-    const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
-                             0x0002, 0x0008, 0x0020, 0x0080};
-#endif
-    uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-    // 4. pack the bytes
-    const uint8_t *row =
-        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-    const uint8x16_t shuffle = vld1q_u8(row + 1);
-    const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+    // Try to apply UTF-16 => UTF-8 routine on 256 bits
+    // (haswell/avx2_convert_utf16_to_utf8.cpp)
 
-    // 5. store bytes
-    vst1q_u8(utf8_output, utf8_packed);
-    // 6. adjust pointers
-    latin1_input += 8;
-    utf8_output += row[0];
+    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+      // 1. pack the bytes
+      const __m128i utf8_packed = _mm_packus_epi16(
+          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+      // 2. store (16 bytes)
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+      // 3. adjust pointers
+      buf += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
+    }
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
 
-  } // while
+    // no bits set above 11th bit
+    const __m256i one_or_two_bytes_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+    const uint32_t one_or_two_bytes_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+    if (one_or_two_bytes_bitmask == 0xffffffff) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+      // expected output   : [110a|aaaa|10bb|bbbb] x 16
+      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
 
-  return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
-}
-/* end file src/arm64/arm_convert_latin1_to_utf8.cpp */
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const __m256i t3 = _mm256_or_si256(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const __m256i t4 = _mm256_or_si256(t3, v_c080);
 
-/* begin file src/arm64/arm_convert_utf8_to_latin1.cpp */
-// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
-// end of the code points. Only the least significant 12 bits of the mask
-// are accessed.
-// It returns how many bytes were consumed (up to 16, usually 12).
-size_t convert_masked_utf8_to_latin1(const char *input,
-                                     uint64_t utf8_end_of_code_point_mask,
-                                     char *&latin1_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it
-  // is maybe beneficial to have fast paths that depend on branch prediction but
-  // have less latency. This results in more instructions but, potentially, also
-  // higher speeds.
+      // 2. merge ASCII and 2-byte codewords
+      const __m256i utf8_unpacked =
+          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
 
-  // We first try a few fast paths.
-  // The obvious first test is ASCII, which actually consumes the full 16.
-  if (utf8_end_of_code_point_mask == 0xfff) {
-    // We process in chunks of 12 bytes
-    vst1q_u8(reinterpret_cast<uint8_t *>(latin1_output), in);
-    latin1_output += 12; // We wrote 12 18-bit characters.
-    return 12;           // We consumed 12 bytes.
-  }
-  /// We do not have a fast path available, or the fast path is unimportant, so
-  /// we fallback.
-  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
-      [input_utf8_end_of_code_point_mask][0];
+      // 3. prepare bitmask for 8-bit lookup
+      const uint32_t M0 = one_byte_bitmask & 0x55555555;
+      const uint32_t M1 = M0 >> 7;
+      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+      // 4. pack the bytes
 
-  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
-      [input_utf8_end_of_code_point_mask][1];
-  // this indicates an invalid input:
-  if (idx >= 64) {
-    return consumed;
-  }
-  // Here we should have (idx < 64), if not, there is a bug in the validation or
-  // elsewhere. SIX (6) input code-code units this is a relatively easy scenario
-  // we process SIX (6) input code-code units. The max length in bytes of six
-  // code code units spanning between 1 and 2 bytes each is 12 bytes. Converts 6
-  // 1-2 byte UTF-8 characters to 6 UTF-16 characters. This is a relatively easy
-  // scenario we process SIX (6) input code-code units. The max length in bytes
-  // of six code code units spanning between 1 and 2 bytes each is 12 bytes.
-  uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
-      simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-  // Shuffle
-  // 1 byte: 00000000 0bbbbbbb
-  // 2 byte: 110aaaaa 10bbbbbb
-  uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh));
-  // Mask
-  // 1 byte: 00000000 0bbbbbbb
-  // 2 byte: 00000000 00bbbbbb
-  uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits
-  // 1 byte: 00000000 00000000
-  // 2 byte: 000aaaaa 00000000
-  uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits
-  // Combine with a shift right accumulate
-  // 1 byte: 00000000 0bbbbbbb
-  // 2 byte: 00000aaa aabbbbbb
-  uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2);
-  // writing 8 bytes even though we only care about the first 6 bytes.
-  uint8x8_t latin1_packed = vmovn_u16(composed);
-  vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
-  latin1_output += 6; // We wrote 6 bytes.
-  return consumed;
-}
-/* end file src/arm64/arm_convert_utf8_to_latin1.cpp */
-/* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */
-// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
-// end of the code points. Only the least significant 12 bits of the mask
-// are accessed.
-// It returns how many bytes were consumed (up to 16, usually 12).
-template <endianness big_endian>
-size_t convert_masked_utf8_to_utf16(const char *input,
-                                    uint64_t utf8_end_of_code_point_mask,
-                                    char16_t *&utf16_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it
-  // is maybe beneficial to have fast paths that depend on branch prediction but
-  // have less latency. This results in more instructions but, potentially, also
-  // higher speeds.
+      const uint8_t *row =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+      const uint8_t *row_2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+                                                                       16)][0];
 
-  // We first try a few fast paths.
-  // The obvious first test is ASCII, which actually consumes the full 16.
-  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) {
-    // We process in chunks of 16 bytes
-    // The routine in simd.h is reused.
-    simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
-    temp.store_ascii_as_utf16<big_endian>(utf16_output);
-    utf16_output += 16; // We wrote 16 16-bit characters.
-    return 16;          // We consumed 16 bytes.
-  }
+      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
 
-  // 3 byte sequences are the next most common, as seen in CJK, which has long
-  // sequences of these.
-  if (input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
-    // UTF-16 code units.
-    uint16x4_t composed = convert_utf8_3_byte_to_utf16(in);
-    // Byte swap if necessary
-    if (!match_system(big_endian)) {
-      composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
-    }
-    vst1_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
-    utf16_output += 4; // We wrote 4 16-bit characters.
-    return 12;         // We consumed 12 bytes.
-  }
+      const __m256i utf8_packed = _mm256_shuffle_epi8(
+          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+      // 5. store bytes
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_castsi256_si128(utf8_packed));
+      utf8_output += row[0];
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_extractf128_si256(utf8_packed, 1));
+      utf8_output += row_2[0];
 
-  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
-  if ((utf8_end_of_code_point_mask & 0xFFF) == 0xaaa) {
-    // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte
-    // UTF-16 code units.
-    uint16x8_t composed = convert_utf8_2_byte_to_utf16(in);
-    // Byte swap if necessary
-    if (!match_system(big_endian)) {
-      composed =
-          vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
+      // 6. adjust pointers
+      buf += 16;
+      continue;
     }
-    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
+    // Must check for overflow in packing
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
+        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+    if (saturation_bitmask == 0xffffffff) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
 
-    utf16_output += 6; // We wrote 6 16-bit characters.
-    return 12;         // We consumed 12 bytes.
-  }
+      // Check for illegal surrogate code units
+      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+      const __m256i forbidden_bytemask =
+          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
+      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
+          0x0) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start),
+                              utf8_output);
+      }
 
-  /// We do not have a fast path available, or the fast path is unimportant, so
-  /// we fallback.
-  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
-      [input_utf8_end_of_code_point_mask][0];
+      const __m256i dup_even = _mm256_setr_epi16(
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
 
-  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
-      [input_utf8_end_of_code_point_mask][1];
+      /* In this branch we handle three cases:
+        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+        single UTF-8 byte
+        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
+        UTF-8 bytes
+        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+        three UTF-8 bytes
 
-  if (idx < 64) {
-    // SIX (6) input code-code units
-    // Convert to UTF-16
-    uint16x8_t composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
-    // Byte swap if necessary
-    if (!match_system(big_endian)) {
-      composed =
-          vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
-    }
-    // Store
-    vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
-    utf16_output += 6; // We wrote 6 16-bit characters.
-    return consumed;
-  } else if (idx < 145) {
-    // FOUR (4) input code-code units
-    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
-        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    // XXX: depending on the system scalar instructions might be faster.
-    // 1 byte: 00000000 00000000 0ccccccc
-    // 2 byte: 00000000 110bbbbb 10cccccc
-    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
-    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
-    // 1 byte: 00000000 0ccccccc
-    // 2 byte: xx0bbbbb x0cccccc
-    // 3 byte: xxbbbbbb x0cccccc
-    uint16x4_t lowperm = vmovn_u32(perm);
-    // Partially mask with bic (doesn't require a temporary register unlike and)
-    // The shift left insert below will clear the top bits.
-    // 1 byte: 00000000 00000000
-    // 2 byte: xx0bbbbb 00000000
-    // 3 byte: xxbbbbbb 00000000
-    uint16x4_t middlebyte = vbic_u16(lowperm, vmov_n_u16(uint16_t(~0xFF00)));
-    // ASCII
-    // 1 byte: 00000000 0ccccccc
-    // 2+byte: 00000000 00cccccc
-    uint16x4_t ascii = vand_u16(lowperm, vmov_n_u16(0x7F));
-    // Split into narrow vectors.
-    // 2 byte: 00000000 00000000
-    // 3 byte: 00000000 xxxxaaaa
-    uint16x4_t highperm = vshrn_n_u32(perm, 16);
-    // Shift right accumulate the middle byte
-    // 1 byte: 00000000 0ccccccc
-    // 2 byte: 00xx0bbb bbcccccc
-    // 3 byte: 00xxbbbb bbcccccc
-    uint16x4_t middlelow = vsra_n_u16(ascii, middlebyte, 2);
-    // Shift left and insert the top 4 bits, overwriting the garbage
-    // 1 byte: 00000000 0ccccccc
-    // 2 byte: 00000bbb bbcccccc
-    // 3 byte: aaaabbbb bbcccccc
-    uint16x4_t composed = vsli_n_u16(middlelow, highperm, 12);
-    // Byte swap if necessary
-    if (!match_system(big_endian)) {
-      composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed)));
-    }
-    vst1_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
+        We expand the input word (16-bit) into two code units (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
 
-    utf16_output += 4; // We wrote 4 16-bit codepoints
-    return consumed;
-  } else if (idx < 209) {
-    // THREE (3) input code-code units
-    if (input_utf8_end_of_code_point_mask == 0x888) {
-      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
-      // UTF-16 pairs. Generating surrogate pairs is a little tricky though, but
-      // it is easier when we can assume they are all pairs. This version does
-      // not use the LUT, but 4 byte sequences are less common and the overhead
-      // of the extra memory access is less important than the early branch
-      // overhead in shorter sequences.
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
 
-      // Swap byte pairs
-      // 10dddddd 10cccccc|10bbbbbb 11110aaa
-      // 10cccccc 10dddddd|11110aaa 10bbbbbb
-      uint8x16_t swap = vrev16q_u8(in);
-      // Shift left 2 bits
-      // cccccc00 dddddd00 xxxxxxxx bbbbbb00
-      uint32x4_t shift = vreinterpretq_u32_u8(vshlq_n_u8(swap, 2));
-      // Create a magic number containing the low 2 bits of the trail surrogate
-      // and all the corrections needed to create the pair. UTF-8 4b prefix   =
-      // -0x0000|0xF000 surrogate offset  = -0x0000|0x0040 (0x10000 << 6)
-      // surrogate high    = +0x0000|0xD800
-      // surrogate low     = +0xDC00|0x0000
-      // -------------------------------
-      //                   = +0xDC00|0xE7C0
-      uint32x4_t magic = vmovq_n_u32(0xDC00E7C0);
-      // Generate unadjusted trail surrogate minus lowest 2 bits
-      // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
-      uint32x4_t trail =
-          vbslq_u32(vmovq_n_u32(0x0000FF00), vreinterpretq_u32_u8(swap), shift);
-      // Insert low 2 bits of trail surrogate to magic number for later
-      // 11011100 00000000 11100111 110000cc
-      uint16x8_t magic_with_low_2 =
-          vreinterpretq_u16_u32(vsraq_n_u32(magic, shift, 30));
-      // Generate lead surrogate
-      // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
-      uint32x4_t lead = vreinterpretq_u32_u16(
-          vsliq_n_u16(vreinterpretq_u16_u8(swap), vreinterpretq_u16_u8(in), 6));
-      // Mask out lead
-      // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
-      lead = vbicq_u32(lead, vmovq_n_u32(uint32_t(~0x03FFFFFF)));
-      // Blend pairs
-      // 000000cc ccdddddd|11110aaa bbbbbb00
-      uint16x8_t blend = vreinterpretq_u16_u32(
-          vbslq_u32(vmovq_n_u32(0x0000FFFF), trail, lead));
-      // Add magic number to finish the result
-      // 110111CC CCDDDDDD|110110AA BBBBBBCC
-      uint16x8_t composed = vaddq_u16(blend, magic_with_low_2);
-      // Byte swap if necessary
-      if (!match_system(big_endian)) {
-        composed =
-            vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
-      }
-      uint16_t buffer[8];
-      vst1q_u16(reinterpret_cast<uint16_t *>(buffer), composed);
-      for (int k = 0; k < 6; k++) {
-        utf16_output[k] = buffer[k];
-      } // the loop might compiler to a couple of instructions.
-      utf16_output += 6; // We wrote 3 32-bit surrogate pairs.
-      return 12;         // We consumed 12 bytes.
-    }
-    // 3 1-4 byte sequences
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
-        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
 
-    // 1 byte: 00000000 00000000 00000000 0ddddddd
-    // 3 byte: 00000000 00000000 110ccccc 10dddddd
-    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
-    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
-    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
-    // added to fix issue https://github.com/simdutf/simdutf/issues/514
-    // We only want to write 2 * 16-bit code units when that is actually what we
-    // have. Unfortunately, we cannot trust the input. So it is possible to get
-    // 0xff as an input byte and it should not result in a surrogate pair. We
-    // need to check for that.
-    uint32_t permbuffer[4];
-    vst1q_u32(permbuffer, perm);
-    // Mask the low and middle bytes
-    // 00000000 00000000 00000000 0ddddddd
-    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f));
-    // Because the surrogates need more work, the high surrogate is computed
-    // first.
-    uint32x4_t middlehigh = vshlq_n_u32(perm, 2);
-    // 00000000 00000000 00cccccc 00000000
-    uint32x4_t middlebyte = vandq_u32(perm, vmovq_n_u32(0x3F00));
-    // Start assembling the sequence. Since the 4th byte is in the same position
-    // as it would be in a surrogate and there is no dependency, shift left
-    // instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte:
-    // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
-    uint32x4_t ab = vbslq_u32(vmovq_n_u32(0xFF000000), perm, middlehigh);
-    // Top 16 bits contains the high ten bits of the surrogate pair before
-    // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa
-    // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction
-    uint32x4_t abc =
-        vbslq_u32(vmovq_n_u32(0xFFFC0000), ab, vshlq_n_u32(middlebyte, 4));
-    // Combine the low 6 or 7 bits by a shift right accumulate
-    // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
-    // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o
-    // correction
-    uint32x4_t composed = vsraq_n_u32(ascii, abc, 6);
-    // After this is for surrogates
-    // Blend the low and high surrogates
-    // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
-    uint32x4_t mixed = vbslq_u32(vmovq_n_u32(0xFFFF0000), abc, composed);
-    // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits
-    // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte:
-    // 11110aaa bbbbbbcc|000000cc ccdddddd
-    uint16x8_t masked_pair = vreinterpretq_u16_u32(
-        vbicq_u32(mixed, vmovq_n_u32(uint32_t(~0xFFFF03FF))));
-    // Correct the remaining UTF-8 prefix, surrogate offset, and add the
-    // surrogate prefixes in one magic 16-bit addition. similar magic number but
-    // without the continue byte adjust and halfword swapped UTF-8 4b prefix   =
-    // -0xF000|0x0000 surrogate offset  = -0x0040|0x0000 (0x10000 << 6)
-    // surrogate high    = +0xD800|0x0000
-    // surrogate low     = +0x0000|0xDC00
-    // -----------------------------------
-    //                   = +0xE7C0|0xDC00
-    uint16x8_t magic = vreinterpretq_u16_u32(vmovq_n_u32(0xE7C0DC00));
-    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
-    uint32x4_t surrogates =
-        vreinterpretq_u32_u16(vaddq_u16(masked_pair, magic));
-    // If the high bit is 1 (s32 less than zero), this needs a surrogate pair
-    uint32x4_t is_pair = vcltzq_s32(vreinterpretq_s32_u32(perm));
+        Finally from these two code units we build proper UTF-8 sequence, taking
+        into account the case (i.e., the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+
+      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+                                             simdutf_vec(0b0100000000000000));
+      const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+      // 4. expand code units 16-bit => 32-bit
+      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+      // Due to the wider registers, the following path is less likely to be
+      // useful.
+      /*if(mask == 0) {
+        // We only have three-byte code units. Use fast path.
+        const __m256i shuffle =
+      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+      _mm256_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output,
+      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output,
+      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+        continue;
+      }*/
+      const uint8_t mask0 = uint8_t(mask);
+      const uint8_t *row0 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+      const __m128i utf8_0 =
+          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
 
-    // Select either the 4 byte surrogate pair or the 2 byte solo codepoint
-    // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
-    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
-    uint32x4_t selected = vbslq_u32(is_pair, surrogates, composed);
-    // Byte swap if necessary
-    if (!match_system(big_endian)) {
-      selected =
-          vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(selected)));
-    }
-    // Attempting to shuffle and store would be complex, just scalarize.
-    uint32_t buffer[4];
-    vst1q_u32(buffer, selected);
-    // Test for the top bit of the surrogate mask. Remove due to issue 514
-    // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 :
-    // 0x00800000;
-    for (size_t i = 0; i < 3; i++) {
-      // Surrogate
-      // Used to be if (buffer[i] & SURROGATE_MASK) {
-      // See discussion above.
-      // patch for issue https://github.com/simdutf/simdutf/issues/514
-      if ((permbuffer[i] & 0xf8000000) == 0xf0000000) {
-        utf16_output[0] = uint16_t(buffer[i] >> 16);
-        utf16_output[1] = uint16_t(buffer[i] & 0xFFFF);
-        utf16_output += 2;
-      } else {
-        utf16_output[0] = uint16_t(buffer[i] & 0xFFFF);
-        utf16_output++;
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+      const uint8_t *row1 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+      const __m128i utf8_1 =
+          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+      const uint8_t *row2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+      const __m128i utf8_2 =
+          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+      const uint8_t *row3 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+      const __m128i utf8_3 =
+          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+      utf8_output += row0[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+      utf8_output += row1[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+      utf8_output += row2[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+      utf8_output += row3[0];
+      buf += 16;
+    } else {
+      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
+      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+      // wasteful to use scalar code, but being efficient with SIMD may require
+      // large, non-trivial tables?
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+          *utf8_output++ = char(word);
+        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k), utf8_output);
+          }
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else { // 4-byte
+          if (word > 0x10FFFF) {
+            return std::make_pair(
+                result(error_code::TOO_LARGE, buf - start + k), utf8_output);
+          }
+          *utf8_output++ = char((word >> 18) | 0b11110000);
+          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
       }
+      buf += k;
     }
-    return consumed;
-  } else {
-    // here we know that there is an error but we do not handle errors
-    return 12;
-  }
+  } // while
+
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
-/* end file src/arm64/arm_convert_utf8_to_utf16.cpp */
-/* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */
-// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
-// end of the code points. Only the least significant 12 bits of the mask
-// are accessed.
-// It returns how many bytes were consumed (up to 12).
-size_t convert_masked_utf8_to_utf32(const char *input,
-                                    uint64_t utf8_end_of_code_point_mask,
-                                    char32_t *&utf32_out) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out);
-  uint8x16_t in = vld1q_u8(reinterpret_cast<const uint8_t *>(input));
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xFFF;
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it
-  // is maybe beneficial to have fast paths that depend on branch prediction but
-  // have less latency. This results in more instructions but, potentially, also
-  // higher speeds.
-  //
-  // We first try a few fast paths.
-  if (utf8_end_of_code_point_mask == 0xfff) {
-    // We process in chunks of 12 bytes.
-    // use fast implementation in src/simdutf/arm64/simd.h
-    // Ideally the compiler can keep the tables in registers.
-    simd8<int8_t> temp{vreinterpretq_s8_u8(in)};
-    temp.store_ascii_as_utf32_tbl(utf32_out);
-    utf32_output += 12; // We wrote 12 32-bit characters.
-    return 12;          // We consumed 12 bytes.
-  }
-  if (input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
-    // UTF-32 code units. Convert to UTF-16
-    uint16x4_t composed_utf16 = convert_utf8_3_byte_to_utf16(in);
-    // Zero extend and store via ST2 with a zero.
-    uint16x4x2_t interleaver = {{composed_utf16, vmov_n_u16(0)}};
-    vst2_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
-    utf32_output += 4; // We wrote 4 32-bit characters.
-    return 12;         // We consumed 12 bytes.
-  }
+/* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
+/* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
+// file included directly
 
-  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
-  if (input_utf8_end_of_code_point_mask == 0xaaa) {
-    // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte
-    // UTF-32 code units. Convert to UTF-16
-    uint16x8_t composed_utf16 = convert_utf8_2_byte_to_utf16(in);
-    // Zero extend and store via ST2 with a zero.
-    uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}};
-    vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
-    utf32_output += 6; // We wrote 6 32-bit characters.
-    return 12;         // We consumed 12 bytes.
-  }
-  /// Either no fast path or an unimportant fast path.
+// Todo: currently, this is just the haswell code, optimize for icelake kernel.
+template <endianness big_endian>
+std::pair<const char32_t *, char16_t *>
+avx512_convert_utf32_to_utf16(const char32_t *buf, size_t len,
+                              char16_t *utf16_output) {
+  const char32_t *end = buf + len;
 
-  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
-      [input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
-      [input_utf8_end_of_code_point_mask][1];
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+  __m256i forbidden_bytemask = _mm256_setzero_si256();
 
-  if (idx < 64) {
-    // SIX (6) input code-code units
-    // Convert to UTF-16
-    uint16x8_t composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
-    // Zero extend and store with ST2 and zero
-    uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}};
-    vst2q_u16(reinterpret_cast<uint16_t *>(utf32_output), interleaver);
-    utf32_output += 6; // We wrote 6 32-bit characters.
-    return consumed;
-  } else if (idx < 145) {
-    // FOUR (4) input code-code units
-    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
-        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    // Shuffle
-    // 1 byte: 00000000 00000000 0ccccccc
-    // 2 byte: 00000000 110bbbbb 10cccccc
-    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
-    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
-    // Split
-    // 00000000 00000000 0ccccccc
-    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); // 6 or 7 bits
-    // Note: unmasked
-    // xxxxxxxx aaaaxxxx xxxxxxxx
-    uint32x4_t high = vshrq_n_u32(perm, 4); // 4 bits
-    // Use 16 bit bic instead of and.
-    // The top bits will be corrected later in the bsl
-    // 00000000 10bbbbbb 00000000
-    uint32x4_t middle = vreinterpretq_u32_u16(
-        vbicq_u16(vreinterpretq_u16_u32(perm),
-                  vmovq_n_u16(uint16_t(~0xff00)))); // 5 or 6 bits
-    // Combine low and middle with shift right accumulate
-    // 00000000 00xxbbbb bbcccccc
-    uint32x4_t lowmid = vsraq_n_u32(ascii, middle, 2);
-    // Insert top 4 bits from high byte with bitwise select
-    // 00000000 aaaabbbb bbcccccc
-    uint32x4_t composed = vbslq_u32(vmovq_n_u32(0x0000F000), high, lowmid);
-    vst1q_u32(utf32_output, composed);
-    utf32_output += 4; // We wrote 4 32-bit characters.
-    return consumed;
-  } else if (idx < 209) {
-    // THREE (3) input code-code units
-    if (input_utf8_end_of_code_point_mask == 0x888) {
-      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
-      // UTF-32 code units. This uses the same method as the fixed 3 byte
-      // version, reversing and shift left insert. However, there is no need for
-      // a shuffle mask now, just rev16 and rev32.
-      //
-      // This version does not use the LUT, but 4 byte sequences are less common
-      // and the overhead of the extra memory access is less important than the
-      // early branch overhead in shorter sequences, so it comes last.
+  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
+    __m256i in = _mm256_loadu_si256((__m256i *)buf);
 
-      // Swap pairs of bytes
-      // 10dddddd|10cccccc|10bbbbbb|11110aaa
-      // 10cccccc 10dddddd|11110aaa 10bbbbbb
-      uint16x8_t swap1 = vreinterpretq_u16_u8(vrev16q_u8(in));
-      // Shift left and insert
-      // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
-      uint16x8_t merge1 = vsliq_n_u16(swap1, vreinterpretq_u16_u8(in), 6);
-      // Swap 16-bit lanes
-      // xxxxcccc ccdddddd xxxxxxxa aabbbbbb
-      // xxxxxxxa aabbbbbb xxxxcccc ccdddddd
-      uint32x4_t swap2 = vreinterpretq_u32_u16(vrev32q_u16(merge1));
-      // Shift insert again
-      // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
-      uint32x4_t merge2 = vsliq_n_u32(swap2, vreinterpretq_u32_u16(merge1), 12);
-      // Clear the garbage
-      // 00000000 000aaabb bbbbcccc ccdddddd
-      uint32x4_t composed = vandq_u32(merge2, vmovq_n_u32(0x1FFFFF));
-      // Store
-      vst1q_u32(utf32_output, composed);
+    const __m256i v_00000000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
 
-      utf32_output += 3; // We wrote 3 32-bit characters.
-      return 12;         // We consumed 12 bytes.
-    }
-    // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit
-    // due to surrogates no longer being involved.
-    uint8x16_t sh = vld1q_u8(reinterpret_cast<const uint8_t *>(
-        simdutf::tables::utf8_to_utf16::shufutf8[idx]));
-    // 1 byte: 00000000 00000000 00000000 0ddddddd
-    // 2 byte: 00000000 00000000 110ccccc 10dddddd
-    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
-    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
-    uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh));
-    // Ascii
-    uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F));
-    uint32x4_t middle = vandq_u32(perm, vmovq_n_u32(0x3f00));
-    // When converting the way we do, the 3 byte prefix will be interpreted as
-    // the 18th bit being set, since the code would interpret the lead byte
-    // (0b1110bbbb) as a continuation byte (0b10bbbbbb). To fix this, we can
-    // either xor or do an 8 bit add of the 6th bit shifted right by 1. Since
-    // NEON has shift right accumulate, we use that.
-    //  4 byte   3 byte
-    // 10bbbbbb 1110bbbb
-    // 00000000 01000000 6th bit
-    // 00000000 00100000 shift right
-    // 10bbbbbb 0000bbbb add
-    // 00bbbbbb 0000bbbb mask
-    uint8x16_t correction =
-        vreinterpretq_u8_u32(vandq_u32(perm, vmovq_n_u32(0x00400000)));
-    uint32x4_t corrected = vreinterpretq_u32_u8(
-        vsraq_n_u8(vreinterpretq_u8_u32(perm), correction, 1));
-    // 00000000 00000000 0000cccc ccdddddd
-    uint32x4_t cd = vsraq_n_u32(ascii, middle, 2);
-    // Insert twice
-    // xxxxxxxx xxxaaabb bbbbxxxx xxxxxxxx
-    uint32x4_t ab = vbslq_u32(vmovq_n_u32(0x01C0000), vshrq_n_u32(corrected, 6),
-                              vshrq_n_u32(corrected, 4));
-    // 00000000 000aaabb bbbbcccc ccdddddd
-    uint32x4_t composed = vbslq_u32(vmovq_n_u32(0xFFE00FFF), cd, ab);
-    // Store
-    vst1q_u32(utf32_output, composed);
-    utf32_output += 3; // We wrote 3 32-bit characters.
-    return consumed;
-  } else {
-    // here we know that there is an error but we do not handle errors
-    return 12;
-  }
-}
-/* end file src/arm64/arm_convert_utf8_to_utf32.cpp */
+    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+    const __m256i saturation_bytemask =
+        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+    const uint32_t saturation_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
 
-/* begin file src/arm64/arm_convert_utf16_to_latin1.cpp */
+    if (saturation_bitmask == 0xffffffff) {
+      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+      forbidden_bytemask = _mm256_or_si256(
+          forbidden_bytemask,
+          _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
 
-template <endianness big_endian>
-std::pair<const char16_t *, char *>
-arm_convert_utf16_to_latin1(const char16_t *buf, size_t len,
-                            char *latin1_output) {
-  const char16_t *end = buf + len;
-  while (end - buf >= 8) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
-    }
-    if (vmaxvq_u16(in) <= 0xff) {
-      // 1. pack the bytes
-      uint8x8_t latin1_packed = vmovn_u16(in);
-      // 2. store (8 bytes)
-      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
-      // 3. adjust pointers
+      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
+                                              _mm256_extractf128_si256(in, 1));
+      if (big_endian) {
+        const __m128i swap =
+            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+      }
+      _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
+      utf16_output += 8;
       buf += 8;
-      latin1_output += 8;
     } else {
-      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
+      size_t forward = 7;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFF0000) == 0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(nullptr, utf16_output);
+          }
+          *utf16_output++ =
+              big_endian
+                  ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
+                  : char16_t(word);
+        } else {
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) {
+            return std::make_pair(nullptr, utf16_output);
+          }
+          word -= 0x10000;
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (big_endian) {
+            high_surrogate =
+                uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+            low_surrogate =
+                uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
+        }
+      }
+      buf += k;
     }
-  } // while
-  return std::make_pair(buf, latin1_output);
+  }
+
+  // check for invalid input
+  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+    return std::make_pair(nullptr, utf16_output);
+  }
+
+  return std::make_pair(buf, utf16_output);
 }
 
+// Todo: currently, this is just the haswell code, optimize for icelake kernel.
 template <endianness big_endian>
-std::pair<result, char *>
-arm_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
-                                        char *latin1_output) {
-  const char16_t *start = buf;
-  const char16_t *end = buf + len;
-  while (end - buf >= 8) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
-    }
-    if (vmaxvq_u16(in) <= 0xff) {
-      // 1. pack the bytes
-      uint8x8_t latin1_packed = vmovn_u16(in);
-      // 2. store (8 bytes)
-      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
-      // 3. adjust pointers
+std::pair<result, char16_t *>
+avx512_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
+                                          char16_t *utf16_output) {
+  const char32_t *start = buf;
+  const char32_t *end = buf + len;
+
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+
+  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
+    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+
+    const __m256i v_00000000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+
+    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+    const __m256i saturation_bytemask =
+        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+    const uint32_t saturation_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+
+    if (saturation_bitmask == 0xffffffff) {
+      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+      const __m256i forbidden_bytemask =
+          _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
+      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
+          0x0) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start),
+                              utf16_output);
+      }
+
+      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
+                                              _mm256_extractf128_si256(in, 1));
+      if (big_endian) {
+        const __m128i swap =
+            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+      }
+      _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
+      utf16_output += 8;
       buf += 8;
-      latin1_output += 8;
     } else {
-      // Let us do a scalar fallback.
-      for (int k = 0; k < 8; k++) {
-        uint16_t word = !match_system(big_endian)
-                            ? scalar::utf16::swap_bytes(buf[k])
-                            : buf[k];
-        if (word <= 0xff) {
-          *latin1_output++ = char(word);
+      size_t forward = 7;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFF0000) == 0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k), utf16_output);
+          }
+          *utf16_output++ =
+              big_endian
+                  ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
+                  : char16_t(word);
         } else {
-          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
-                                latin1_output);
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) {
+            return std::make_pair(
+                result(error_code::TOO_LARGE, buf - start + k), utf16_output);
+          }
+          word -= 0x10000;
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (big_endian) {
+            high_surrogate =
+                uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+            low_surrogate =
+                uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
         }
       }
+      buf += k;
     }
-  } // while
-  return std::make_pair(result(error_code::SUCCESS, buf - start),
-                        latin1_output);
+  }
+
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+}
+/* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
+/* begin file src/icelake/icelake_ascii_validation.inl.cpp */
+// file included directly
+
+bool validate_ascii(const char *buf, size_t len) {
+  const char *end = buf + len;
+  const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
+  __m512i running_or = _mm512_setzero_si512();
+  for (; end - buf >= 64; buf += 64) {
+    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)buf);
+    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii,
+                                           0xf8); // running_or | (utf8 & ascii)
+  }
+  if (buf < end) {
+    const __m512i utf8 = _mm512_maskz_loadu_epi8(
+        (uint64_t(1) << (end - buf)) - 1, (const __m512i *)buf);
+    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii,
+                                           0xf8); // running_or | (utf8 & ascii)
+  }
+  return (_mm512_test_epi8_mask(running_or, running_or) == 0);
 }
-/* end file src/arm64/arm_convert_utf16_to_latin1.cpp */
-/* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */
-/*
-    The vectorized algorithm works on single SSE register i.e., it
-    loads eight 16-bit code units.
+/* end file src/icelake/icelake_ascii_validation.inl.cpp */
+/* begin file src/icelake/icelake_utf32_validation.inl.cpp */
+// file included directly
 
-    We consider three cases:
-    1. an input register contains no surrogates and each value
-       is in range 0x0000 .. 0x07ff.
-    2. an input register contains no surrogates and values are
-       is in range 0x0000 .. 0xffff.
-    3. an input register contains surrogates --- i.e. codepoints
-       can have 16 or 32 bits.
+const char32_t *validate_utf32(const char32_t *buf, size_t len) {
+  if (len < 16) {
+    return buf;
+  }
+  const char32_t *end = buf + len - 16;
 
-    Ad 1.
+  const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000);
+  __m512i currentmax = _mm512_setzero_si512();
+  __m512i currentoffsetmax = _mm512_setzero_si512();
 
-    When values are less than 0x0800, it means that a 16-bit code unit
-    can be converted into: 1) single UTF8 byte (when it is an ASCII
-    char) or 2) two UTF8 bytes.
+  while (buf <= end) {
+    __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf);
+    buf += 16;
+    currentoffsetmax =
+        _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
+    currentmax = _mm512_max_epu32(utf32, currentmax);
+  }
 
-    For this case we do only some shuffle to obtain these 2-byte
-    codes and finally compress the whole SSE register with a single
-    shuffle.
+  const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff);
+  const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff);
+  __m512i is_zero =
+      _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax);
+  if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
+    return nullptr;
+  }
+  is_zero = _mm512_xor_si512(
+      _mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
+  if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
+    return nullptr;
+  }
 
-    We need 256-entry lookup table to get a compression pattern
-    and the number of output bytes in the compressed vector register.
-    Each entry occupies 17 bytes.
+  return buf;
+}
+/* end file src/icelake/icelake_utf32_validation.inl.cpp */
+/* begin file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */
+// file included directly
 
-    Ad 2.
+static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len,
+                                               char *utf8_output,
+                                               int mask_output) {
+  __mmask64 nonascii = _mm512_movepi8_mask(input);
+  size_t output_size = input_len + (size_t)count_ones(nonascii);
 
-    When values fit in 16-bit code units, but are above 0x07ff, then
-    a single word may produce one, two or three UTF8 bytes.
+  // Mask to denote whether the byte is a leading byte that is not ascii
+  __mmask64 sixth = _mm512_cmpge_epu8_mask(
+      input, _mm512_set1_epi8(-64)); // binary representation of -64: 1100 0000
 
-    We prepare data for all these three cases in two registers.
-    The first register contains lower two UTF8 bytes (used in all
-    cases), while the second one contains just the third byte for
-    the three-UTF8-bytes case.
+  const uint64_t alternate_bits = UINT64_C(0x5555555555555555);
+  uint64_t ascii = ~nonascii;
+  // the bits in ascii are inverted and zeros are interspersed in between them
+  uint64_t maskA = ~_pdep_u64(ascii, alternate_bits);
+  uint64_t maskB = ~_pdep_u64(ascii >> 32, alternate_bits);
 
-    Finally these two registers are interleaved forming eight-element
-    array of 32-bit values. The array spans two SSE registers.
-    The bytes from the registers are compressed using two shuffles.
+  // interleave bytes from top and bottom halves (abcd...ABCD -> aAbBcCdD)
+  __m512i input_interleaved = _mm512_permutexvar_epi8(
+      _mm512_set_epi32(0x3f1f3e1e, 0x3d1d3c1c, 0x3b1b3a1a, 0x39193818,
+                       0x37173616, 0x35153414, 0x33133212, 0x31113010,
+                       0x2f0f2e0e, 0x2d0d2c0c, 0x2b0b2a0a, 0x29092808,
+                       0x27072606, 0x25052404, 0x23032202, 0x21012000),
+      input);
 
-    We need 256-entry lookup table to get a compression pattern
-    and the number of output bytes in the compressed vector register.
-    Each entry occupies 17 bytes.
+  // double size of each byte, and insert the leading byte 1100 0010
 
+  /*
+  upscale the bytes to 16-bit values, adding the 0b11000010 leading byte in the
+  process. We adjust for the bytes that have their two most significant bits
+  set. This takes care of the first 32 bytes, assuming we interleaved them. */
+  __m512i outputA =
+      _mm512_shldi_epi16(input_interleaved, _mm512_set1_epi8(-62), 8);
+  outputA = _mm512_mask_add_epi16(
+      outputA, (__mmask32)sixth, outputA,
+      _mm512_set1_epi16(1 - 0x4000)); // 1 - 0x4000 = 1100 0000 0000 0001
 
-    To summarize:
-    - We need two 256-entry tables that have 8704 bytes in total.
-*/
-/*
-  Returns a pair: the first unprocessed byte from buf and utf8_output
-  A scalar routing should carry on the conversion of the tail.
-*/
-template <endianness big_endian>
-std::pair<const char16_t *, char32_t *>
-arm_convert_utf16_to_utf32(const char16_t *buf, size_t len,
-                           char32_t *utf32_out) {
-  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
-  const char16_t *end = buf + len;
+  // in the second 32-bit half, set first or second option based on whether
+  // original input is leading byte (second case) or not (first case)
+  __m512i leadingB =
+      _mm512_mask_blend_epi16((__mmask32)(sixth >> 32),
+                              _mm512_set1_epi16(0x00c2),  // 0000 0000 1100 0010
+                              _mm512_set1_epi16(0x40c3)); // 0100 0000 1100 0011
+  __m512i outputB = _mm512_ternarylogic_epi32(
+      input_interleaved, leadingB, _mm512_set1_epi16((short)0xff00),
+      (240 & 170) ^ 204); // (input_interleaved & 0xff00) ^ leadingB
 
-  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+  // prune redundant bytes
+  outputA = _mm512_maskz_compress_epi8(maskA, outputA);
+  outputB = _mm512_maskz_compress_epi8(maskB, outputB);
 
-  while (end - buf >= 8) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
-    }
+  size_t output_sizeA = (size_t)count_ones((uint32_t)nonascii) + 32;
 
-    const uint16x8_t surrogates_bytemask =
-        vceqq_u16(vandq_u16(in, v_f800), v_d800);
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
-    // However, it is likely an uncommon occurrence.
-    if (vmaxvq_u16(surrogates_bytemask) == 0) {
-      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
-      // units
-      vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
-      vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
-      utf32_output += 8;
-      buf += 8;
-      // surrogate pair(s) in a register
+  if (mask_output) {
+    if (input_len > 32) { // is the second half of the input vector used?
+      __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_sizeA);
+      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA);
+      utf8_output += output_sizeA;
+      write_mask = _bzhi_u64(~0ULL, (unsigned int)(output_size - output_sizeA));
+      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputB);
     } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if (size_t(end - buf) < forward + 1) {
-        forward = size_t(end - buf - 1);
-      }
-      for (; k < forward; k++) {
-        uint16_t word = !match_system(big_endian)
-                            ? scalar::utf16::swap_bytes(buf[k])
-                            : buf[k];
-        if ((word & 0xF800) != 0xD800) {
-          *utf32_output++ = char32_t(word);
-        } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = !match_system(big_endian)
-                                   ? scalar::utf16::swap_bytes(buf[k + 1])
-                                   : buf[k + 1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if ((diff | diff2) > 0x3FF) {
-            return std::make_pair(nullptr,
-                                  reinterpret_cast<char32_t *>(utf32_output));
-          }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
-        }
-      }
-      buf += k;
+      __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_size);
+      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA);
     }
-  } // while
-  return std::make_pair(buf, reinterpret_cast<char32_t *>(utf32_output));
+  } else {
+    _mm512_storeu_si512(utf8_output, outputA);
+    utf8_output += output_sizeA;
+    _mm512_storeu_si512(utf8_output, outputB);
+  }
+  return output_size;
 }
 
-/*
-  Returns a pair: a result struct and utf8_output.
-  If there is an error, the count field of the result is the position of the
-  error. Otherwise, it is the position of the first unprocessed byte in buf
-  (even if finished). A scalar routing should carry on the conversion of the
-  tail if needed.
-*/
-template <endianness big_endian>
-std::pair<result, char32_t *>
-arm_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
-                                       char32_t *utf32_out) {
-  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
-  const char16_t *start = buf;
-  const char16_t *end = buf + len;
+static inline size_t latin1_to_utf8_avx512_branch(__m512i input,
+                                                  char *utf8_output) {
+  __mmask64 nonascii = _mm512_movepi8_mask(input);
+  if (nonascii) {
+    return latin1_to_utf8_avx512_vec(input, 64, utf8_output, 0);
+  } else {
+    _mm512_storeu_si512(utf8_output, input);
+    return 64;
+  }
+}
 
-  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
+size_t latin1_to_utf8_avx512_start(const char *buf, size_t len,
+                                   char *utf8_output) {
+  char *start = utf8_output;
+  size_t pos = 0;
+  // if there's at least 128 bytes remaining, we don't need to mask the output
+  for (; pos + 128 <= len; pos += 64) {
+    __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
+    utf8_output += latin1_to_utf8_avx512_branch(input, utf8_output);
+  }
+  // in the last 128 bytes, the first 64 may require masking the output
+  if (pos + 64 <= len) {
+    __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
+    utf8_output += latin1_to_utf8_avx512_vec(input, 64, utf8_output, 1);
+    pos += 64;
+  }
+  // with the last 64 bytes, the input also needs to be masked
+  if (pos < len) {
+    __mmask64 load_mask = _bzhi_u64(~0ULL, (unsigned int)(len - pos));
+    __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)(buf + pos));
+    utf8_output += latin1_to_utf8_avx512_vec(input, len - pos, utf8_output, 1);
+  }
+  return (size_t)(utf8_output - start);
+}
+/* end file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */
+/* begin file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */
+// file included directly
+template <endianness big_endian>
+size_t icelake_convert_latin1_to_utf16(const char *latin1_input, size_t len,
+                                       char16_t *utf16_output) {
+  size_t rounded_len = len & ~0x1F; // Round down to nearest multiple of 32
 
-  while ((end - buf) >= 8) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
+                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  for (size_t i = 0; i < rounded_len; i += 32) {
+    // Load 32 Latin1 characters into a 256-bit register
+    __m256i in = _mm256_loadu_si256((__m256i *)&latin1_input[i]);
+    // Zero extend the 32 Latin1 characters to 32 16-bit integers
+    __m512i out = _mm512_cvtepu8_epi16(in);
+    if (big_endian) {
+      out = _mm512_shuffle_epi8(out, byteflip);
     }
+    // Store the results back to memory
+    _mm512_storeu_si512((__m512i *)&utf16_output[i], out);
+  }
+  if (rounded_len != len) {
+    uint32_t mask = uint32_t(1 << (len - rounded_len)) - 1;
+    __m256i in = _mm256_maskz_loadu_epi8(mask, latin1_input + rounded_len);
 
-    const uint16x8_t surrogates_bytemask =
-        vceqq_u16(vandq_u16(in, v_f800), v_d800);
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
-    // However, it is likely an uncommon occurrence.
-    if (vmaxvq_u16(surrogates_bytemask) == 0) {
-      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
-      // units
-      vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in)));
-      vst1q_u32(utf32_output + 4, vmovl_high_u16(in));
-      utf32_output += 8;
-      buf += 8;
-      // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if (size_t(end - buf) < forward + 1) {
-        forward = size_t(end - buf - 1);
-      }
-      for (; k < forward; k++) {
-        uint16_t word = !match_system(big_endian)
-                            ? scalar::utf16::swap_bytes(buf[k])
-                            : buf[k];
-        if ((word & 0xF800) != 0xD800) {
-          *utf32_output++ = char32_t(word);
-        } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = !match_system(big_endian)
-                                   ? scalar::utf16::swap_bytes(buf[k + 1])
-                                   : buf[k + 1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if ((diff | diff2) > 0x3FF) {
-            return std::make_pair(
-                result(error_code::SURROGATE, buf - start + k - 1),
-                reinterpret_cast<char32_t *>(utf32_output));
-          }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf32_output++ = char32_t(value);
-        }
-      }
-      buf += k;
+    // Zero extend each of the 32 Latin1 characters to a 16-bit integer
+    __m512i out = _mm512_cvtepu8_epi16(in);
+    if (big_endian) {
+      out = _mm512_shuffle_epi8(out, byteflip);
     }
-  } // while
-  return std::make_pair(result(error_code::SUCCESS, buf - start),
-                        reinterpret_cast<char32_t *>(utf32_output));
+    // Store the results back to memory
+    _mm512_mask_storeu_epi16(utf16_output + rounded_len, mask, out);
+  }
+
+  return len;
 }
-/* end file src/arm64/arm_convert_utf16_to_utf32.cpp */
-/* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */
-/*
-    The vectorized algorithm works on single SSE register i.e., it
-    loads eight 16-bit code units.
+/* end file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */
+/* begin file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */
+std::pair<const char *, char32_t *>
+avx512_convert_latin1_to_utf32(const char *buf, size_t len,
+                               char32_t *utf32_output) {
+  size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
 
-    We consider three cases:
-    1. an input register contains no surrogates and each value
-       is in range 0x0000 .. 0x07ff.
-    2. an input register contains no surrogates and values are
-       is in range 0x0000 .. 0xffff.
-    3. an input register contains surrogates --- i.e. codepoints
-       can have 16 or 32 bits.
+  for (size_t i = 0; i < rounded_len; i += 16) {
+    // Load 16 Latin1 characters into a 128-bit register
+    __m128i in = _mm_loadu_si128((__m128i *)&buf[i]);
 
-    Ad 1.
+    // Zero extend each of the 16 Latin1 characters to a 32-bit integer using
+    // vpmovzxbd
+    __m512i out = _mm512_cvtepu8_epi32(in);
 
-    When values are less than 0x0800, it means that a 16-bit code unit
-    can be converted into: 1) single UTF8 byte (when it is an ASCII
-    char) or 2) two UTF8 bytes.
+    // Store the results back to memory
+    _mm512_storeu_si512((__m512i *)&utf32_output[i], out);
+  }
 
-    For this case we do only some shuffle to obtain these 2-byte
-    codes and finally compress the whole SSE register with a single
-    shuffle.
+  // Return pointers pointing to where we left off
+  return std::make_pair(buf + rounded_len, utf32_output + rounded_len);
+}
+/* end file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */
+/* begin file src/icelake/icelake_base64.inl.cpp */
+// file included directly
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
 
-    We need 256-entry lookup table to get a compression pattern
-    and the number of output bytes in the compressed vector register.
-    Each entry occupies 17 bytes.
+struct block64 {
+  __m512i chunks[1];
+};
+
+template <bool base64_url>
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+                     base64_options options) {
+  // credit: Wojciech Muła
+  const uint8_t *input = (const uint8_t *)src;
+
+  uint8_t *out = (uint8_t *)dst;
+  static const char *lookup_tbl =
+      base64_url
+          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+  const __m512i shuffle_input = _mm512_setr_epi32(
+      0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
+      0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
+      0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
+  const __m512i lookup =
+      _mm512_loadu_si512(reinterpret_cast<const __m512i *>(lookup_tbl));
+  const __m512i multi_shifts = _mm512_set1_epi64(UINT64_C(0x3036242a1016040a));
+  size_t size = srclen;
+  __mmask64 input_mask = 0xffffffffffff; // (1 << 48) - 1
+  while (size >= 48) {
+    const __m512i v = _mm512_maskz_loadu_epi8(
+        input_mask, reinterpret_cast<const __m512i *>(input));
+    const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
+    const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
+    const __m512i result = _mm512_permutexvar_epi8(indices, lookup);
+    _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
+    out += 64;
+    input += 48;
+    size -= 48;
+  }
+  input_mask = ((__mmask64)1 << size) - 1;
+  const __m512i v = _mm512_maskz_loadu_epi8(
+      input_mask, reinterpret_cast<const __m512i *>(input));
+  const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
+  const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
+  bool padding_needed =
+      (((options & base64_url) == 0) ^
+       ((options & base64_reverse_padding) == base64_reverse_padding));
+  size_t padding_amount = ((size % 3) > 0) ? (3 - (size % 3)) : 0;
+  size_t output_len = ((size + 2) / 3) * 4;
+  size_t non_padded_output_len = output_len - padding_amount;
+  if (!padding_needed) {
+    output_len = non_padded_output_len;
+  }
+  __mmask64 output_mask = output_len == 64 ? (__mmask64)UINT64_MAX
+                                           : ((__mmask64)1 << output_len) - 1;
+  __m512i result = _mm512_mask_permutexvar_epi8(
+      _mm512_set1_epi8('='), ((__mmask64)1 << non_padded_output_len) - 1,
+      indices, lookup);
+  _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask,
+                          result);
+  return (size_t)(out - (uint8_t *)dst) + output_len;
+}
+
+template <bool base64_url>
+static inline uint64_t to_base64_mask(block64 *b, uint64_t *error,
+                                      uint64_t input_mask = UINT64_MAX) {
+  __m512i input = b->chunks[0];
+  const __m512i ascii_space_tbl = _mm512_set_epi8(
+      0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10,
+      9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0,
+      0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32);
+  __m512i lookup0;
+  if (base64_url) {
+    lookup0 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
+        52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1,
+        -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1);
+  } else {
+    lookup0 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
+        52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128,
+        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128,
+        -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128);
+  }
+  __m512i lookup1;
+  if (base64_url) {
+    lookup1 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
+        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
+        63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
+        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+  } else {
+    lookup1 = _mm512_set_epi8(
+        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
+        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
+        -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
+        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
+  }
+
+  const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1);
+  const __m512i combined = _mm512_or_si512(translated, input);
+  const __mmask64 mask = _mm512_movepi8_mask(combined) & input_mask;
+  if (mask) {
+    const __mmask64 spaces =
+        _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(ascii_space_tbl, input),
+                               input) &
+        input_mask;
+    *error = (mask ^ spaces);
+  }
+  b->chunks[0] = translated;
+
+  return mask | (~input_mask);
+}
 
-    Ad 2.
+static inline void copy_block(block64 *b, char *output) {
+  _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), b->chunks[0]);
+}
 
-    When values fit in 16-bit code units, but are above 0x07ff, then
-    a single word may produce one, two or three UTF8 bytes.
+static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
+  uint64_t nmask = ~mask;
+  __m512i c = _mm512_maskz_compress_epi8(nmask, b->chunks[0]);
+  _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), c);
+  return _mm_popcnt_u64(nmask);
+}
 
-    We prepare data for all these three cases in two registers.
-    The first register contains lower two UTF8 bytes (used in all
-    cases), while the second one contains just the third byte for
-    the three-UTF8-bytes case.
+// The caller must ensure that at least 64 bytes are available for reading at
+// src. The data is read into a block64 structure.
+static inline void load_block(block64 *b, const char *src) {
+  b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
+}
 
-    Finally these two registers are interleaved forming eight-element
-    array of 32-bit values. The array spans two SSE registers.
-    The bytes from the registers are compressed using two shuffles.
+static inline void load_block_partial(block64 *b, const char *src,
+                                      __mmask64 input_mask) {
+  b->chunks[0] = _mm512_maskz_loadu_epi8(
+      input_mask, reinterpret_cast<const __m512i *>(src));
+}
 
-    We need 256-entry lookup table to get a compression pattern
-    and the number of output bytes in the compressed vector register.
-    Each entry occupies 17 bytes.
+// The caller must ensure that at least 128 bytes are available for reading at
+// src. The data is read into a block64 structure.
+static inline void load_block(block64 *b, const char16_t *src) {
+  __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
+  __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 32));
+  __m512i p = _mm512_packus_epi16(m1, m2);
+  b->chunks[0] =
+      _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
+}
 
+static inline void load_block_partial(block64 *b, const char16_t *src,
+                                      __mmask64 input_mask) {
+  __m512i m1 = _mm512_maskz_loadu_epi16((__mmask32)input_mask,
+                                        reinterpret_cast<const __m512i *>(src));
+  __m512i m2 =
+      _mm512_maskz_loadu_epi16((__mmask32)(input_mask >> 32),
+                               reinterpret_cast<const __m512i *>(src + 32));
+  __m512i p = _mm512_packus_epi16(m1, m2);
+  b->chunks[0] =
+      _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
+}
 
-    To summarize:
-    - We need two 256-entry tables that have 8704 bytes in total.
-*/
-/*
-  Returns a pair: the first unprocessed byte from buf and utf8_output
-  A scalar routing should carry on the conversion of the tail.
-*/
-template <endianness big_endian>
-std::pair<const char16_t *, char *>
-arm_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) {
-  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
-  const char16_t *end = buf + len;
+static inline void base64_decode(char *out, __m512i str) {
+  const __m512i merge_ab_and_bc =
+      _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140));
+  const __m512i merged =
+      _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
+  const __m512i pack = _mm512_set_epi8(
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58,
+      52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34,
+      28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4,
+      5, 6, 0, 1, 2);
+  const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);
+  _mm512_mask_storeu_epi8(
+      (__m512i *)out, 0xffffffffffff,
+      shuffled); // mask would be 0xffffffffffff since we write 48 bytes.
+}
+// decode 64 bytes and output 48 bytes
+static inline void base64_decode_block(char *out, const char *src) {
+  base64_decode(out,
+                _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src)));
+}
+static inline void base64_decode_block(char *out, block64 *b) {
+  base64_decode(out, b->chunks[0]);
+}
 
-  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-  const size_t safety_margin =
-      12; // to avoid overruns, see issue
-          // https://github.com/simdutf/simdutf/issues/92
-  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
+template <bool base64_url, typename chartype>
+full_result
+compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+                       base64_options options,
+                       last_chunk_handling_options last_chunk_options) {
+  (void)options;
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
+  size_t equallocation =
+      srclen; // location of the first padding character if any
+  size_t equalsigns = 0;
+  // skip trailing spaces
+  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+         to_base64[uint8_t(src[srclen - 1])] == 64) {
+    srclen--;
+  }
+  if (srclen > 0 && src[srclen - 1] == '=') {
+    equallocation = srclen - 1;
+    srclen--;
+    equalsigns = 1;
+    // skip trailing spaces
+    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+           to_base64[uint8_t(src[srclen - 1])] == 64) {
+      srclen--;
     }
-    if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
-      // It is common enough that we have sequences of 16 consecutive ASCII
-      // characters.
-      uint16x8_t nextin =
-          vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
-      if (!match_system(big_endian)) {
-        nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin)));
+    if (srclen > 0 && src[srclen - 1] == '=') {
+      equallocation = srclen - 1;
+      srclen--;
+      equalsigns = 2;
+    }
+  }
+  if (srclen == 0) {
+    if (equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation, 0};
+    }
+    return {SUCCESS, 0, 0};
+  }
+  const chartype *const srcinit = src;
+  const char *const dstinit = dst;
+  const chartype *const srcend = src + srclen;
+
+  // figure out why block_size == 2 is sometimes best???
+  constexpr size_t block_size = 6;
+  char buffer[block_size * 64];
+  char *bufferptr = buffer;
+  if (srclen >= 64) {
+    const chartype *const srcend64 = src + srclen - 64;
+    while (src <= srcend64) {
+      block64 b;
+      load_block(&b, src);
+      src += 64;
+      uint64_t error = 0;
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
+      if (error) {
+        src -= 64;
+        size_t error_offset = _tzcnt_u64(error);
+        return {error_code::INVALID_BASE64_CHARACTER,
+                size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
       }
-      if (vmaxvq_u16(nextin) > 0x7F) {
-        // 1. pack the bytes
-        // obviously suboptimal.
-        uint8x8_t utf8_packed = vmovn_u16(in);
-        // 2. store (8 bytes)
-        vst1_u8(utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 8;
-        utf8_output += 8;
-        in = nextin;
+      if (badcharmask != 0) {
+        // optimization opportunity: check for simple masks like those made of
+        // continuous 1s followed by continuous 0s. And masks containing a
+        // single bad character.
+        bufferptr += compress_block(&b, badcharmask, bufferptr);
+      } else if (bufferptr != buffer) {
+        copy_block(&b, bufferptr);
+        bufferptr += 64;
       } else {
-        // 1. pack the bytes
-        // obviously suboptimal.
-        uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
-        // 2. store (16 bytes)
-        vst1q_u8(utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 16;
-        utf8_output += 16;
-        continue; // we are done for this round!
+        base64_decode_block(dst, &b);
+        dst += 48;
+      }
+      if (bufferptr >= (block_size - 1) * 64 + buffer) {
+        for (size_t i = 0; i < (block_size - 1); i++) {
+          base64_decode_block(dst, buffer + i * 64);
+          dst += 48;
+        }
+        std::memcpy(buffer, buffer + (block_size - 1) * 64,
+                    64); // 64 might be too much
+        bufferptr -= (block_size - 1) * 64;
       }
     }
+  }
 
-    if (vmaxvq_u16(in) <= 0x7FF) {
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-      const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const uint16x8_t t0 = vshlq_n_u16(in, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const uint16x8_t t2 = vandq_u16(in, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const uint16x8_t t3 = vorrq_u16(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-      // 2. merge ASCII and 2-byte codewords
-      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
-      const uint8x16_t utf8_unpacked =
-          vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
-      // 3. prepare bitmask for 8-bit lookup
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint16x8_t mask = simdutf_make_uint16x8_t(
-          0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
-#else
-      const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
-                               0x0002, 0x0008, 0x0020, 0x0080};
-#endif
-      uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-      // 4. pack the bytes
-      const uint8_t *row =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-      const uint8x16_t shuffle = vld1q_u8(row + 1);
-      const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
-
-      // 5. store bytes
-      vst1q_u8(utf8_output, utf8_packed);
-
-      // 6. adjust pointers
-      buf += 8;
-      utf8_output += row[0];
-      continue;
+  int last_block_len = (int)(srcend - src);
+  if (last_block_len != 0) {
+    __mmask64 input_mask = ((__mmask64)1 << last_block_len) - 1;
+    block64 b;
+    load_block_partial(&b, src, input_mask);
+    uint64_t error = 0;
+    uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error, input_mask);
+    if (error) {
+      size_t error_offset = _tzcnt_u64(error);
+      return {error_code::INVALID_BASE64_CHARACTER,
+              size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
     }
-    const uint16x8_t surrogates_bytemask =
-        vceqq_u16(vandq_u16(in, v_f800), v_d800);
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
-    // However, it is likely an uncommon occurrence.
-    if (vmaxvq_u16(surrogates_bytemask) == 0) {
-      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint16x8_t dup_even = simdutf_make_uint16x8_t(
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-#else
-      const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
-                                   0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
-#endif
-      /* In this branch we handle three cases:
-         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
-        single UFT-8 byte
-         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
-        UTF-8 bytes
-         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
-        three UTF-8 bytes
+    src += last_block_len;
+    bufferptr += compress_block(&b, badcharmask, bufferptr);
+  }
 
-        We expand the input word (16-bit) into two code units (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
+  char *buffer_start = buffer;
+  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
+    base64_decode_block(dst, buffer_start);
+    dst += 48;
+  }
 
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
+  if ((bufferptr - buffer_start) != 0) {
+    size_t rem = (bufferptr - buffer_start);
+    int idx = rem % 4;
+    __mmask64 mask = ((__mmask64)1 << rem) - 1;
+    __m512i input = _mm512_maskz_loadu_epi8(mask, buffer_start);
+    size_t output_len = (rem / 4) * 3;
+    __mmask64 output_mask = mask >> (rem - output_len);
+    const __m512i merge_ab_and_bc =
+        _mm512_maddubs_epi16(input, _mm512_set1_epi32(0x01400140));
+    const __m512i merged =
+        _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
+    const __m512i pack = _mm512_set_epi8(
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58,
+        52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34,
+        28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4,
+        5, 6, 0, 1, 2);
+    const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);
+
+    if (last_chunk_options == last_chunk_handling_options::strict &&
+        (idx != 1) && ((idx + equalsigns) & 3) != 0) {
+      // The partial chunk was at src - idx
+      _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
+      dst += output_len;
+      return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
+              size_t(dst - dstinit)};
+    } else if (last_chunk_options ==
+                   last_chunk_handling_options::stop_before_partial &&
+               (idx != 1) && ((idx + equalsigns) & 3) != 0) {
+      // Rewind src to before partial chunk
+      _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
+      dst += output_len;
+      src -= idx;
+    } else {
+      if (idx == 2) {
+        if (last_chunk_options == last_chunk_handling_options::strict) {
+          uint32_t triple = (uint32_t(bufferptr[-2]) << 3 * 6) +
+                            (uint32_t(bufferptr[-1]) << 2 * 6);
+          if (triple & 0xffff) {
+            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
+            dst += output_len;
+            return {BASE64_EXTRA_BITS, size_t(src - srcinit),
+                    size_t(dst - dstinit)};
+          }
+        }
+        output_mask = (output_mask << 1) | 1;
+        output_len += 1;
+        _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
+        dst += output_len;
+      } else if (idx == 3) {
+        if (last_chunk_options == last_chunk_handling_options::strict) {
+          uint32_t triple = (uint32_t(bufferptr[-3]) << 3 * 6) +
+                            (uint32_t(bufferptr[-2]) << 2 * 6) +
+                            (uint32_t(bufferptr[-1]) << 1 * 6);
+          if (triple & 0xff) {
+            _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
+            dst += output_len;
+            return {BASE64_EXTRA_BITS, size_t(src - srcinit),
+                    size_t(dst - dstinit)};
+          }
+        }
+        output_mask = (output_mask << 2) | 3;
+        output_len += 2;
+        _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
+        dst += output_len;
+      } else if (idx == 1) {
+        _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
+        dst += output_len;
+        return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
+                size_t(dst - dstinit)};
+      } else {
+        _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled);
+        dst += output_len;
+      }
+    }
 
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
+    if (last_chunk_options != stop_before_partial && equalsigns > 0) {
+      size_t output_count = size_t(dst - dstinit);
+      if ((output_count % 3 == 0) ||
+          ((output_count % 3) + 1 + equalsigns != 4)) {
+        return {INVALID_BASE64_CHARACTER, equallocation, output_count};
+      }
+    }
 
-        Finally from these two code units we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const uint16x8_t t0 = vreinterpretq_u16_u8(
-          vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+    return {SUCCESS, srclen, size_t(dst - dstinit)};
+  }
 
-      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-      const uint16x8_t s0 = vshrq_n_u16(in, 12);
-      // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-      const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
-      // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-      const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-      // [00bb|bbbb|0000|aaaa]
-      const uint16x8_t s2 = vorrq_u16(s0, s1s);
-      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
-      const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-      const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
-      const uint16x8_t m0 =
-          vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
-      const uint16x8_t s4 = veorq_u16(s3, m0);
-#undef simdutf_vec
+  if (equalsigns > 0) {
+    if ((size_t(dst - dstinit) % 3 == 0) ||
+        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
+      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
+    }
+  }
+  return {SUCCESS, srclen, size_t(dst - dstinit)};
+}
+/* end file src/icelake/icelake_base64.inl.cpp */
 
-      // 4. expand code units 16-bit => 32-bit
-      const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-      const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+#include <cstdint>
 
-      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint16x8_t onemask = simdutf_make_uint16x8_t(
-          0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
-      const uint16x8_t twomask = simdutf_make_uint16x8_t(
-          0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
-#else
-      const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
-                                  0x0100, 0x0400, 0x1000, 0x4000};
-      const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
-                                  0x0200, 0x0800, 0x2000, 0x8000};
-#endif
-      const uint16x8_t combined =
-          vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
-                    vandq_u16(one_or_two_bytes_bytemask, twomask));
-      const uint16_t mask = vaddvq_u16(combined);
-      // The following fast path may or may not be beneficial.
-      /*if(mask == 0) {
-        // We only have three-byte code units. Use fast path.
-        const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-        vst1q_u8(utf8_output, utf8_0);
-        utf8_output += 12;
-        vst1q_u8(utf8_output, utf8_1);
-        utf8_output += 12;
-        buf += 8;
-        continue;
-      }*/
-      const uint8_t mask0 = uint8_t(mask);
+} // namespace
+} // namespace icelake
+} // namespace simdutf
 
-      const uint8_t *row0 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-      const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+namespace simdutf {
+namespace icelake {
 
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-      const uint8_t *row1 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-      const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+                                 size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  // todo: convert to a one-pass algorithm
+  if (bom_encoding != encoding_type::unspecified) {
+    return bom_encoding;
+  }
+  int out = 0;
+  if (validate_utf8(input, length)) {
+    out |= encoding_type::UTF8;
+  }
+  if ((length % 2) == 0) {
+    if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
+                         length / 2)) {
+      out |= encoding_type::UTF16_LE;
+    }
+  }
+  if ((length % 4) == 0) {
+    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+      out |= encoding_type::UTF32_LE;
+    }
+  }
+  return out;
+}
 
-      vst1q_u8(utf8_output, utf8_0);
-      utf8_output += row0[0];
-      vst1q_u8(utf8_output, utf8_1);
-      utf8_output += row1[0];
+simdutf_warn_unused bool // true when the checker reports no error over buf[0, len)
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) { // empty input is accepted without any load
+    return true;
+  }
+  avx512_utf8_checker checker{};
+  const char *ptr = buf;
+  const char *end = ptr + len;
+  for (; end - ptr >= 64; ptr += 64) { // whole 64-byte blocks
+    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+    checker.check_next_input(utf8);
+  }
+  if (end != ptr) { // 1 to 63 bytes are left over
+    const __m512i utf8 = _mm512_maskz_loadu_epi8( // masked-off lanes load as zero
+        ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr); // low (end - ptr) bits set
+    checker.check_next_input(utf8);
+  }
+  checker.check_eof(); // NOTE(review): presumably flags a sequence cut off at end of input - confirm in avx512_utf8_checker
+  return !checker.errors();
+}
 
-      buf += 8;
-      // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if (size_t(end - buf) < forward + 1) {
-        forward = size_t(end - buf - 1);
-      }
-      for (; k < forward; k++) {
-        uint16_t word = !match_system(big_endian)
-                            ? scalar::utf16::swap_bytes(buf[k])
-                            : buf[k];
-        if ((word & 0xFF80) == 0) {
-          *utf8_output++ = char(word);
-        } else if ((word & 0xF800) == 0) {
-          *utf8_output++ = char((word >> 6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if ((word & 0xF800) != 0xD800) {
-          *utf8_output++ = char((word >> 12) | 0b11100000);
-          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = !match_system(big_endian)
-                                   ? scalar::utf16::swap_bytes(buf[k + 1])
-                                   : buf[k + 1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if ((diff | diff2) > 0x3FF) {
-            return std::make_pair(nullptr,
-                                  reinterpret_cast<char *>(utf8_output));
-          }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value >> 18) | 0b11110000);
-          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
+simdutf_warn_unused result implementation::validate_utf8_with_errors( // SUCCESS with len, or the scalar routine's result on error
+    const char *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) { // empty input succeeds with count 0
+    return result(error_code::SUCCESS, len);
+  }
+  avx512_utf8_checker checker{};
+  const char *ptr = buf;
+  const char *end = ptr + len;
+  size_t count{0}; // bytes covered by the full 64-byte blocks that passed so far
+  for (; end - ptr >= 64; ptr += 64) {
+    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
+    checker.check_next_input(utf8);
+    if (checker.errors()) { // the vector check only says "somewhere"; scalar code finds where
+      if (count != 0) {
+        count--;
+      } // Sometimes the error is only detected in the next chunk
+      result res = scalar::utf8::rewind_and_validate_with_errors(
+          reinterpret_cast<const char *>(buf),
+          reinterpret_cast<const char *>(buf + count), len - count);
+      res.count += count; // NOTE(review): presumably res.count is relative to buf + count - confirm
+      return res;
     }
-  } // while
+    count += 64;
+  }
+  if (end != ptr) { // 1 to 63 bytes are left over
+    const __m512i utf8 = _mm512_maskz_loadu_epi8( // masked-off lanes load as zero
+        ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr); // low (end - ptr) bits set
+    checker.check_next_input(utf8);
+  }
+  checker.check_eof(); // NOTE(review): presumably flags a sequence cut off at end of input - confirm
+  if (checker.errors()) { // same scalar re-scan as in the loop above
+    if (count != 0) {
+      count--;
+    } // Sometimes the error is only detected in the next chunk
+    result res = scalar::utf8::rewind_and_validate_with_errors(
+        reinterpret_cast<const char *>(buf),
+        reinterpret_cast<const char *>(buf + count), len - count);
+    res.count += count; // NOTE(review): presumably res.count is relative to buf + count - confirm
+    return res;
+  }
+  return result(error_code::SUCCESS, len);
+}
+
+simdutf_warn_unused bool // member entry point; the work is done by a namespace-level function
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  return icelake::validate_ascii(buf, len); // qualified name: the icelake namespace function, not this member
+}
 
-  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
+simdutf_warn_unused result implementation::validate_ascii_with_errors( // SUCCESS with len, or TOO_LARGE at the first byte >= 0x80
+    const char *buf, size_t len) const noexcept {
+  const char *buf_orig = buf; // kept so error positions are relative to the start
+  const char *end = buf + len;
+  const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); // 0x80 in every byte lane
+  for (; end - buf >= 64; buf += 64) { // whole 64-byte blocks
+    const __m512i input = _mm512_loadu_si512((const __m512i *)buf);
+    __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT); // unsigned "not less than 0x80"
+    if (notascii) {
+      return result(error_code::TOO_LARGE,
+                    buf - buf_orig + _tzcnt_u64(notascii)); // lowest set bit = first offending byte
+    }
+  }
+  if (end != buf) { // 1 to 63 bytes are left over
+    const __m512i input = _mm512_maskz_loadu_epi8( // masked-off lanes load as zero, which is ASCII
+        ~UINT64_C(0) >> (64 - (end - buf)), (const __m512i *)buf); // low (end - buf) bits set
+    __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
+    if (notascii) {
+      return result(error_code::TOO_LARGE,
+                    buf - buf_orig + _tzcnt_u64(notascii));
+    }
+  }
+  return result(error_code::SUCCESS, len);
 }
 
-/*
-  Returns a pair: a result struct and utf8_output.
-  If there is an error, the count field of the result is the position of the
-  error. Otherwise, it is the position of the first unprocessed byte in buf
-  (even if finished). A scalar routing should carry on the conversion of the
-  tail if needed.
-*/
-template <endianness big_endian>
-std::pair<result, char *>
-arm_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
-                                      char *utf8_out) {
-  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
-  const char16_t *start = buf;
+simdutf_warn_unused bool // true when every surrogate in buf[0, len) is part of a high+low pair
+implementation::validate_utf16le(const char16_t *buf,
+                                 size_t len) const noexcept {
   const char16_t *end = buf + len;
 
-  const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
-  const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-  const size_t safety_margin =
-      12; // to avoid overruns, see issue
-          // https://github.com/simdutf/simdutf/issues/92
-
-  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
-    uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
-    if (!match_system(big_endian)) {
-      in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in)));
-    }
-    if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
-      // It is common enough that we have sequences of 16 consecutive ASCII
-      // characters.
-      uint16x8_t nextin =
-          vld1q_u16(reinterpret_cast<const uint16_t *>(buf) + 8);
-      if (!match_system(big_endian)) {
-        nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin)));
+  for (; end - buf >= 32;) { // whole blocks of 32 code units; the step is chosen inside
+    __m512i in = _mm512_loadu_si512((__m512i *)buf);
+    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+    __mmask32 surrogates = // lanes in 0xD800..0xDFFF, i.e. diff below 0x0800
+        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+    if (surrogates) {
+      __mmask32 highsurrogates = // lanes in 0xD800..0xDBFF, i.e. diff below 0x0400
+          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+      __mmask32 lowsurrogates = surrogates ^ highsurrogates; // lanes in 0xDC00..0xDFFF
+      // each high must sit right before a low, and each low right after a high
+      if ((highsurrogates << 1) != lowsurrogates) { // a high in lane 31 shifts out here
+        return false;
       }
-      if (vmaxvq_u16(nextin) > 0x7F) {
-        // 1. pack the bytes
-        // obviously suboptimal.
-        uint8x8_t utf8_packed = vmovn_u16(in);
-        // 2. store (8 bytes)
-        vst1_u8(utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 8;
-        utf8_output += 8;
-        in = nextin;
+      bool ends_with_high = ((highsurrogates & 0x80000000) != 0); // lane 31 is a high
+      if (ends_with_high) {
+        buf += 31; // advance only by 31 code units so that we start with the
+                   // high surrogate on the next round.
       } else {
-        // 1. pack the bytes
-        // obviously suboptimal.
-        uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin);
-        // 2. store (16 bytes)
-        vst1q_u8(utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 16;
-        utf8_output += 16;
-        continue; // we are done for this round!
+        buf += 32;
       }
+    } else {
+      buf += 32; // no surrogate in this block, nothing to pair up
     }
-
-    if (vmaxvq_u16(in) <= 0x7FF) {
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-      const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const uint16x8_t t0 = vshlq_n_u16(in, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const uint16x8_t t2 = vandq_u16(in, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const uint16x8_t t3 = vorrq_u16(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-      // 2. merge ASCII and 2-byte codewords
-      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
-      const uint8x16_t utf8_unpacked =
-          vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4));
-      // 3. prepare bitmask for 8-bit lookup
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint16x8_t mask = simdutf_make_uint16x8_t(
-          0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
-#else
-      const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
-                               0x0002, 0x0008, 0x0020, 0x0080};
-#endif
-      uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-      // 4. pack the bytes
-      const uint8_t *row =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-      const uint8x16_t shuffle = vld1q_u8(row + 1);
-      const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
-
-      // 5. store bytes
-      vst1q_u8(utf8_output, utf8_packed);
-
-      // 6. adjust pointers
-      buf += 8;
-      utf8_output += row[0];
-      continue;
+  }
+  if (buf < end) { // fewer than 32 code units remain
+    __m512i in = // masked-off lanes load as zero, which is not a surrogate
+        _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf); // low (end - buf) bits set
+    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+    __mmask32 surrogates = // lanes in 0xD800..0xDFFF
+        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+    if (surrogates) {
+      __mmask32 highsurrogates = // lanes in 0xD800..0xDBFF
+          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+      __mmask32 lowsurrogates = surrogates ^ highsurrogates; // lanes in 0xDC00..0xDFFF
+      // a high in the last loaded lane fails here: the lane after it is zero
+      if ((highsurrogates << 1) != lowsurrogates) {
+        return false;
+      }
     }
-    const uint16x8_t surrogates_bytemask =
-        vceqq_u16(vandq_u16(in, v_f800), v_d800);
-    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
-    // However, it is likely an uncommon occurrence.
-    if (vmaxvq_u16(surrogates_bytemask) == 0) {
-      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint16x8_t dup_even = simdutf_make_uint16x8_t(
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-#else
-      const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
-                                   0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
-#endif
-      /* In this branch we handle three cases:
-         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
-        single UFT-8 byte
-         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
-        UTF-8 bytes
-         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
-        three UTF-8 bytes
-
-        We expand the input word (16-bit) into two code units (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two code units we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const uint16x8_t t0 = vreinterpretq_u16_u8(
-          vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+  }
+  return true;
+}
 
-      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-      const uint16x8_t s0 = vshrq_n_u16(in, 12);
-      // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-      const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
-      // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-      const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-      // [00bb|bbbb|0000|aaaa]
-      const uint16x8_t s2 = vorrq_u16(s0, s1s);
-      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
-      const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-      const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
-      const uint16x8_t m0 =
-          vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
-      const uint16x8_t s4 = veorq_u16(s3, m0);
-#undef simdutf_vec
+simdutf_warn_unused bool // true when every surrogate in buf[0, len) is part of a high+low pair
+implementation::validate_utf16be(const char16_t *buf,
+                                 size_t len) const noexcept {
+  const char16_t *end = buf + len;
+  const __m512i byteflip = _mm512_setr_epi64( // shuffle indices that swap the two bytes of each 16-bit lane
+      0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+      0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+      0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  for (; end - buf >= 32;) { // whole blocks of 32 code units; the step is chosen inside
+    __m512i in = // byte-swapped so the range tests below see native-order values
+        _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)buf), byteflip);
+    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+    __mmask32 surrogates = // lanes in 0xD800..0xDFFF, i.e. diff below 0x0800
+        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+    if (surrogates) {
+      __mmask32 highsurrogates = // lanes in 0xD800..0xDBFF, i.e. diff below 0x0400
+          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+      __mmask32 lowsurrogates = surrogates ^ highsurrogates; // lanes in 0xDC00..0xDFFF
+      // each high must sit right before a low, and each low right after a high
+      if ((highsurrogates << 1) != lowsurrogates) { // a high in lane 31 shifts out here
+        return false;
+      }
+      bool ends_with_high = ((highsurrogates & 0x80000000) != 0); // lane 31 is a high
+      if (ends_with_high) {
+        buf += 31; // advance only by 31 code units so that we start with the
+                   // high surrogate on the next round.
+      } else {
+        buf += 32;
+      }
+    } else {
+      buf += 32; // no surrogate in this block, nothing to pair up
+    }
+  }
+  if (buf < end) { // fewer than 32 code units remain
+    __m512i in = _mm512_shuffle_epi8( // masked-off lanes load as zero and stay zero after the swap
+        _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf), // low (end - buf) bits set
+        byteflip);
+    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+    __mmask32 surrogates = // lanes in 0xD800..0xDFFF
+        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+    if (surrogates) {
+      __mmask32 highsurrogates = // lanes in 0xD800..0xDBFF
+          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+      __mmask32 lowsurrogates = surrogates ^ highsurrogates; // lanes in 0xDC00..0xDFFF
+      // a high in the last loaded lane fails here: the lane after it is zero
+      if ((highsurrogates << 1) != lowsurrogates) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
 
-      // 4. expand code units 16-bit => 32-bit
-      const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-      const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+simdutf_warn_unused result implementation::validate_utf16le_with_errors( // SUCCESS with len, or SURROGATE at the first unpaired unit
+    const char16_t *buf, size_t len) const noexcept {
+  const char16_t *start_buf = buf; // kept so error positions are relative to the start
+  const char16_t *end = buf + len;
+  for (; end - buf >= 32;) { // whole blocks of 32 code units; the step is chosen inside
+    __m512i in = _mm512_loadu_si512((__m512i *)buf);
+    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+    __mmask32 surrogates = // lanes in 0xD800..0xDFFF, i.e. diff below 0x0800
+        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+    if (surrogates) {
+      __mmask32 highsurrogates = // lanes in 0xD800..0xDBFF, i.e. diff below 0x0400
+          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+      __mmask32 lowsurrogates = surrogates ^ highsurrogates; // lanes in 0xDC00..0xDFFF
+      // high must be followed by low; on a mismatch report the earliest stray unit
+      if ((highsurrogates << 1) != lowsurrogates) { // extra_low: first low with no high before it
+        uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
+        uint32_t extra_high = // first high with no low after it
+            _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
+        return result(error_code::SURROGATE,
+                      (buf - start_buf) +
+                          (extra_low < extra_high ? extra_low : extra_high)); // _tzcnt_u32(0) is 32, so an empty mask never wins
+      }
+      bool ends_with_high = ((highsurrogates & 0x80000000) != 0); // lane 31 is a high
+      if (ends_with_high) {
+        buf += 31; // advance only by 31 code units so that we start with the
+                   // high surrogate on the next round.
+      } else {
+        buf += 32;
+      }
+    } else {
+      buf += 32; // no surrogate in this block, nothing to pair up
+    }
+  }
+  if (buf < end) { // fewer than 32 code units remain
+    __m512i in = // masked-off lanes load as zero, which is not a surrogate
+        _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf); // low (end - buf) bits set
+    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+    __mmask32 surrogates = // lanes in 0xD800..0xDFFF
+        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+    if (surrogates) {
+      __mmask32 highsurrogates = // lanes in 0xD800..0xDBFF
+          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+      __mmask32 lowsurrogates = surrogates ^ highsurrogates; // lanes in 0xDC00..0xDFFF
+      // high must be followed by low; a high in the last loaded lane fails too
+      if ((highsurrogates << 1) != lowsurrogates) { // extra_low: first low with no high before it
+        uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
+        uint32_t extra_high = // first high with no low after it
+            _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
+        return result(error_code::SURROGATE,
+                      (buf - start_buf) +
+                          (extra_low < extra_high ? extra_low : extra_high)); // _tzcnt_u32(0) is 32, so an empty mask never wins
+      }
+    }
+  }
+  return result(error_code::SUCCESS, len);
+}
 
-      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-      const uint16x8_t onemask = simdutf_make_uint16x8_t(
-          0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
-      const uint16x8_t twomask = simdutf_make_uint16x8_t(
-          0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
-#else
-      const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
-                                  0x0100, 0x0400, 0x1000, 0x4000};
-      const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
-                                  0x0200, 0x0800, 0x2000, 0x8000};
-#endif
-      const uint16x8_t combined =
-          vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
-                    vandq_u16(one_or_two_bytes_bytemask, twomask));
-      const uint16_t mask = vaddvq_u16(combined);
-      // The following fast path may or may not be beneficial.
-      /*if(mask == 0) {
-        // We only have three-byte code units. Use fast path.
-        const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-        vst1q_u8(utf8_output, utf8_0);
-        utf8_output += 12;
-        vst1q_u8(utf8_output, utf8_1);
-        utf8_output += 12;
-        buf += 8;
-        continue;
-      }*/
-      const uint8_t mask0 = uint8_t(mask);
+simdutf_warn_unused result implementation::validate_utf16be_with_errors( // SUCCESS with len, or SURROGATE at the first unpaired unit
+    const char16_t *buf, size_t len) const noexcept {
+  const char16_t *start_buf = buf; // kept so error positions are relative to the start
+  const char16_t *end = buf + len;
+  const __m512i byteflip = _mm512_setr_epi64( // shuffle indices that swap the two bytes of each 16-bit lane
+      0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+      0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+      0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  for (; end - buf >= 32;) { // whole blocks of 32 code units; the step is chosen inside
+    __m512i in = // byte-swapped so the range tests below see native-order values
+        _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)buf), byteflip);
+    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+    __mmask32 surrogates = // lanes in 0xD800..0xDFFF, i.e. diff below 0x0800
+        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+    if (surrogates) {
+      __mmask32 highsurrogates = // lanes in 0xD800..0xDBFF, i.e. diff below 0x0400
+          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+      __mmask32 lowsurrogates = surrogates ^ highsurrogates; // lanes in 0xDC00..0xDFFF
+      // high must be followed by low; on a mismatch report the earliest stray unit
+      if ((highsurrogates << 1) != lowsurrogates) { // extra_low: first low with no high before it
+        uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
+        uint32_t extra_high = // first high with no low after it
+            _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
+        return result(error_code::SURROGATE,
+                      (buf - start_buf) +
+                          (extra_low < extra_high ? extra_low : extra_high)); // _tzcnt_u32(0) is 32, so an empty mask never wins
+      }
+      bool ends_with_high = ((highsurrogates & 0x80000000) != 0); // lane 31 is a high
+      if (ends_with_high) {
+        buf += 31; // advance only by 31 code units so that we start with the
+                   // high surrogate on the next round.
+      } else {
+        buf += 32;
+      }
+    } else {
+      buf += 32; // no surrogate in this block, nothing to pair up
+    }
+  }
+  if (buf < end) { // fewer than 32 code units remain
+    __m512i in = _mm512_shuffle_epi8( // masked-off lanes load as zero and stay zero after the swap
+        _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf), // low (end - buf) bits set
+        byteflip);
+    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
+    __mmask32 surrogates = // lanes in 0xD800..0xDFFF
+        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
+    if (surrogates) {
+      __mmask32 highsurrogates = // lanes in 0xD800..0xDBFF
+          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
+      __mmask32 lowsurrogates = surrogates ^ highsurrogates; // lanes in 0xDC00..0xDFFF
+      // high must be followed by low; a high in the last loaded lane fails too
+      if ((highsurrogates << 1) != lowsurrogates) { // extra_low: first low with no high before it
+        uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
+        uint32_t extra_high = // first high with no low after it
+            _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
+        return result(error_code::SURROGATE,
+                      (buf - start_buf) +
+                          (extra_low < extra_high ? extra_low : extra_high)); // _tzcnt_u32(0) is 32, so an empty mask never wins
+      }
+    }
+  }
+  return result(error_code::SUCCESS, len);
+}
 
-      const uint8_t *row0 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-      const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+simdutf_warn_unused bool // icelake-level check first, scalar check for whatever it leaves
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  const char32_t *tail = icelake::validate_utf32(buf, len); // null, or where the scalar check resumes
+  if (tail) {
+    return scalar::utf32::validate(tail, len - (tail - buf)); // remaining length after tail
+  } else {
+    // Null tail: either the call above found an error, or buf itself was
+    // nullptr (may happen for empty input); only the empty case is valid.
+    return len == 0;
+  }
+}
 
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-      const uint8_t *row1 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-      const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+simdutf_warn_unused result implementation::validate_utf32_with_errors( // SUCCESS with len, or TOO_LARGE / SURROGATE at the first bad value
+    const char32_t *buf, size_t len) const noexcept {
+  const char32_t *buf_orig = buf; // kept so error positions are relative to the start
+  if (len >= 16) {
+    const char32_t *end = buf + len - 16; // last position where a full 16-lane load fits
+    while (buf <= end) {
+      __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf);
+      __mmask16 outside_range = _mm512_cmp_epu32_mask( // lanes above 0x10FFFF
+          utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT);
 
-      vst1q_u8(utf8_output, utf8_0);
-      utf8_output += row0[0];
-      vst1q_u8(utf8_output, utf8_1);
-      utf8_output += row1[0];
+      __m512i utf32_off = // wrapping add: moves 0xD800..0xDFFF to 0xFFFFF800..0xFFFFFFFF
+          _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
 
-      buf += 8;
-      // surrogate pair(s) in a register
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if (size_t(end - buf) < forward + 1) {
-        forward = size_t(end - buf - 1);
-      }
-      for (; k < forward; k++) {
-        uint16_t word = !match_system(big_endian)
-                            ? scalar::utf16::swap_bytes(buf[k])
-                            : buf[k];
-        if ((word & 0xFF80) == 0) {
-          *utf8_output++ = char(word);
-        } else if ((word & 0xF800) == 0) {
-          *utf8_output++ = char((word >> 6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if ((word & 0xF800) != 0xD800) {
-          *utf8_output++ = char((word >> 12) | 0b11100000);
-          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {
-          // must be a surrogate pair
-          uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word = !match_system(big_endian)
-                                   ? scalar::utf16::swap_bytes(buf[k + 1])
-                                   : buf[k + 1];
-          k++;
-          uint16_t diff2 = uint16_t(next_word - 0xDC00);
-          if ((diff | diff2) > 0x3FF) {
-            return std::make_pair(
-                result(error_code::SURROGATE, buf - start + k - 1),
-                reinterpret_cast<char *>(utf8_output));
-          }
-          uint32_t value = (diff << 10) + diff2 + 0x10000;
-          *utf8_output++ = char((value >> 18) | 0b11110000);
-          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+      __mmask16 surrogate_range = _mm512_cmp_epu32_mask( // lanes that were in 0xD800..0xDFFF
+          utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT);
+      if ((outside_range | surrogate_range)) {
+        auto outside_idx = _tzcnt_u32(outside_range); // 32 when the mask is empty
+        auto surrogate_idx = _tzcnt_u32(surrogate_range); // 32 when the mask is empty
+
+        if (outside_idx < surrogate_idx) { // the too-large value comes first
+          return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx);
         }
+
+        return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx);
       }
-      buf += k;
+
+      buf += 16;
     }
-  } // while
+  }
+  if (len > 0) { // may find zero values left, in which case the load mask is empty
+    __m512i utf32 = _mm512_maskz_loadu_epi32( // masked-off lanes load as zero, which is valid
+        __mmask16((1U << (buf_orig + len - buf)) - 1), (const __m512i *)buf); // one bit per remaining value
+    __mmask16 outside_range = _mm512_cmp_epu32_mask( // lanes above 0x10FFFF
+        utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT);
+    __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000)); // same wrapping shift as above
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start),
-                        reinterpret_cast<char *>(utf8_output));
+    __mmask16 surrogate_range = _mm512_cmp_epu32_mask( // lanes that were in 0xD800..0xDFFF
+        utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT);
+    if ((outside_range | surrogate_range)) {
+      auto outside_idx = _tzcnt_u32(outside_range); // 32 when the mask is empty
+      auto surrogate_idx = _tzcnt_u32(surrogate_range); // 32 when the mask is empty
+
+      if (outside_idx < surrogate_idx) { // the too-large value comes first
+        return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx);
+      }
+
+      return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx);
+    }
+  }
+
+  return result(error_code::SUCCESS, len);
 }
-/* end file src/arm64/arm_convert_utf16_to_utf8.cpp */
 
-/* begin file src/arm64/arm_base64.cpp */
-/**
- * References and further reading:
- *
- * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
- * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
- * https://arxiv.org/abs/1910.05109
- *
- * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
- * Instructions, ACM Transactions on the Web 12 (3), 2018.
- * https://arxiv.org/abs/1704.00605
- *
- * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
- * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
- * Request for Comments: 4648.
- *
- * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
- * http://www.alfredklomp.com/programming/sse-base64/. (2014).
- *
- * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
- * acceleration. https://github.com/aklomp/base64. (2014).
- *
- * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
- * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
- *
- * Nick Kopp. 2013. Base64 Encoding on a GPU.
- * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
- */
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( // returns whatever the icelake helper returns
+    const char *buf, size_t len, char *utf8_output) const noexcept {
+  return icelake::latin1_to_utf8_avx512_start(buf, len, utf8_output); // NOTE(review): presumably the number of bytes written - confirm in the helper
+}
 
-size_t encode_base64(char *dst, const char *src, size_t srclen,
-                     base64_options options) {
-  // credit: Wojciech Muła
-  uint8_t *out = (uint8_t *)dst;
-  constexpr static uint8_t source_table[64] = {
-      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
-      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
-      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
-      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
-      'N', 'd', 't', '9', 'O', 'e', 'u', '+', 'P', 'f', 'v', '/',
-  };
-  constexpr static uint8_t source_table_url[64] = {
-      'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D',
-      'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W',
-      'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p',
-      '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8',
-      'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_',
-  };
-  const uint8x16_t v3f = vdupq_n_u8(0x3f);
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  // When trying to load a uint8_t array, Visual Studio might
-  // error with: error C2664: '__n128x4 neon_ld4m_q8(const char *)':
-  // cannot convert argument 1 from 'const uint8_t [64]' to 'const char *
-  const uint8x16x4_t table = vld4q_u8(
-      (reinterpret_cast<const char *>(options & base64_url) ? source_table_url
-                                                            : source_table));
-#else
-  const uint8x16x4_t table =
-      vld4q_u8((options & base64_url) ? source_table_url : source_table);
-#endif
-  size_t i = 0;
-  for (; i + 16 * 3 <= srclen; i += 16 * 3) {
-    const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i);
-    uint8x16x4_t result;
-    result.val[0] = vshrq_n_u8(in.val[0], 2);
-    result.val[1] =
-        vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), v3f);
-    result.val[2] =
-        vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), v3f);
-    result.val[3] = vandq_u8(in.val[2], v3f);
-    result.val[0] = vqtbl4q_u8(table, result.val[0]);
-    result.val[1] = vqtbl4q_u8(table, result.val[1]);
-    result.val[2] = vqtbl4q_u8(table, result.val[2]);
-    result.val[3] = vqtbl4q_u8(table, result.val[3]);
-    vst4q_u8(out, result);
-    out += 64;
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return icelake_convert_latin1_to_utf16<endianness::LITTLE>(buf, len,
+                                                             utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return icelake_convert_latin1_to_utf16<endianness::BIG>(buf, len,
+                                                          utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char *, char32_t *> ret =
+      avx512_convert_latin1_to_utf32(buf, len, utf32_output);
+  if (ret.first == nullptr) {
+    return 0;
   }
-  out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i,
-                                            options);
+  size_t converted_chars = ret.second - utf32_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
+        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_converted_chars == 0) {
+      return 0;
+    }
+    converted_chars += scalar_converted_chars;
+  }
+  return converted_chars;
+}
 
-  return size_t((char *)out - dst);
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  return icelake::utf8_to_latin1_avx512(buf, len, latin1_output);
 }
 
-static inline void compress(uint8x16_t data, uint16_t mask, char *output) {
-  if (mask == 0) {
-    vst1q_u8((uint8_t *)output, data);
-    return;
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  // First, try to convert as much as possible using the SIMD implementation.
+  const char *obuf = buf;
+  char *olatin1_output = latin1_output;
+  size_t written = icelake::utf8_to_latin1_avx512(obuf, len, olatin1_output);
+
+  // If we have completely converted the string
+  if (obuf == buf + len) {
+    return {simdutf::SUCCESS, written};
   }
-  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
-  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
-  uint64x2_t compactmasku64 = {tables::base64::thintable_epi8[mask1],
-                               tables::base64::thintable_epi8[mask2]};
-  uint8x16_t compactmask = vreinterpretq_u8_u64(compactmasku64);
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t off =
-      simdutf_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8);
-#else
-  const uint8x16_t off = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
-#endif
+  size_t pos = obuf - buf;
+  result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+      pos, buf + pos, len - pos, latin1_output);
+  res.count += pos;
+  return res;
+}
 
-  compactmask = vaddq_u8(compactmask, off);
-  uint8x16_t pruned = vqtbl1q_u8(data, compactmask);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  return icelake::valid_utf8_to_latin1_avx512(buf, len, latin1_output);
+}
 
-  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
-  // then load the corresponding mask, what it does is to write
-  // only the first pop1 bytes from the first 8 bytes, and then
-  // it fills in with the bytes from the second 8 bytes + some filling
-  // at the end.
-  compactmask = vld1q_u8(tables::base64::pshufb_combine_table + pop1 * 8);
-  uint8x16_t answer = vqtbl1q_u8(pruned, compactmask);
-  vst1q_u8((uint8_t *)output, answer);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16_result ret =
+      fast_avx512_convert_utf8_to_utf16<endianness::LITTLE>(buf, len,
+                                                            utf16_output);
+  if (ret.second == nullptr) {
+    return 0;
+  }
+  return ret.second - utf16_output;
 }
 
-struct block64 {
-  uint8x16_t chunks[4];
-};
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::BIG>(
+      buf, len, utf16_output);
+  if (ret.second == nullptr) {
+    return 0;
+  }
+  return ret.second - utf16_output;
+}
 
-static_assert(sizeof(block64) == 64, "block64 is not 64 bytes");
-template <bool base64_url> uint64_t to_base64_mask(block64 *b, bool *error) {
-  uint8x16_t v0f = vdupq_n_u8(0xf);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::LITTLE>(
+      buf, len, utf16_output);
+}
 
-  uint8x16_t underscore0, underscore1, underscore2, underscore3;
-  if (base64_url) {
-    underscore0 = vceqq_u8(b->chunks[0], vdupq_n_u8(0x5f));
-    underscore1 = vceqq_u8(b->chunks[1], vdupq_n_u8(0x5f));
-    underscore2 = vceqq_u8(b->chunks[2], vdupq_n_u8(0x5f));
-    underscore3 = vceqq_u8(b->chunks[3], vdupq_n_u8(0x5f));
-  } else {
-    (void)underscore0;
-    (void)underscore1;
-    (void)underscore2;
-    (void)underscore3;
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::BIG>(
+      buf, len, utf16_output);
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16_result ret =
+      icelake::valid_utf8_to_fixed_length<endianness::LITTLE, char16_t>(
+          buf, len, utf16_output);
+  size_t saved_bytes = ret.second - utf16_output;
+  const char *end = buf + len;
+  if (ret.first == end) {
+    return saved_bytes;
   }
 
-  uint8x16_t lo_nibbles0 = vandq_u8(b->chunks[0], v0f);
-  uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f);
-  uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f);
-  uint8x16_t lo_nibbles3 = vandq_u8(b->chunks[3], v0f);
+  // Note: the AVX512 procedure looks up 4 bytes forward, and
+  //       correctly converts multi-byte chars even if their
+  //       continuation bytes lie outside the 16-byte window.
+  //       This means we have to skip continuation bytes at
+  //       the beginning of ret.first, as they were already consumed.
+  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
+    ret.first += 1;
+  }
 
-  // Needed by the decoding step.
-  uint8x16_t hi_nibbles0 = vshrq_n_u8(b->chunks[0], 4);
-  uint8x16_t hi_nibbles1 = vshrq_n_u8(b->chunks[1], 4);
-  uint8x16_t hi_nibbles2 = vshrq_n_u8(b->chunks[2], 4);
-  uint8x16_t hi_nibbles3 = vshrq_n_u8(b->chunks[3], 4);
-  uint8x16_t lut_lo;
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  if (base64_url) {
-    lut_lo =
-        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                                0x70, 0x61, 0xe1, 0xf4, 0xe5, 0xa5, 0xf4, 0xf4);
-  } else {
-    lut_lo =
-        simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                                0x70, 0x61, 0xe1, 0xb4, 0xe5, 0xe5, 0xf4, 0xb4);
+  if (ret.first != end) {
+    const size_t scalar_saved_bytes =
+        scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
   }
-#else
-  if (base64_url) {
-    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                        0x70, 0x61, 0xe1, 0xf4, 0xe5, 0xa5, 0xf4, 0xf4};
-  } else {
-    lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
-                        0x70, 0x61, 0xe1, 0xb4, 0xe5, 0xe5, 0xf4, 0xb4};
+
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16_result ret =
+      icelake::valid_utf8_to_fixed_length<endianness::BIG, char16_t>(
+          buf, len, utf16_output);
+  size_t saved_bytes = ret.second - utf16_output;
+  const char *end = buf + len;
+  if (ret.first == end) {
+    return saved_bytes;
   }
-#endif
-  uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0);
-  uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1);
-  uint8x16_t lo2 = vqtbl1q_u8(lut_lo, lo_nibbles2);
-  uint8x16_t lo3 = vqtbl1q_u8(lut_lo, lo_nibbles3);
-  uint8x16_t lut_hi;
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  if (base64_url) {
-    lut_hi =
-        simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
-                                0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
-  } else {
-    lut_hi =
-        simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4,
-                                0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20);
+
+  // Note: the AVX512 procedure looks up 4 bytes forward, and
+  //       correctly converts multi-byte chars even if their
+  //       continuation bytes lie outside the 16-byte window.
+  //       This means we have to skip continuation bytes at
+  //       the beginning of ret.first, as they were already consumed.
+  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
+    ret.first += 1;
   }
-#else
-  if (base64_url) {
-    lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
-                        0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
-  } else {
-    lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8,  0x4,  0x8,  0x4,
-                        0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20};
+
+  if (ret.first != end) {
+    const size_t scalar_saved_bytes =
+        scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
   }
-#endif
-  uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_nibbles0);
-  uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_nibbles1);
-  uint8x16_t hi2 = vqtbl1q_u8(lut_hi, hi_nibbles2);
-  uint8x16_t hi3 = vqtbl1q_u8(lut_hi, hi_nibbles3);
 
-  if (base64_url) {
-    hi0 = vbicq_u8(hi0, underscore0);
-    hi1 = vbicq_u8(hi1, underscore1);
-    hi2 = vbicq_u8(hi2, underscore2);
-    hi3 = vbicq_u8(hi3, underscore3);
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_out) const noexcept {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+  utf8_to_utf32_result ret =
+      icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(
+          buf, len, utf32_output);
+  if (ret.second == nullptr)
+    return 0;
+
+  size_t saved_bytes = ret.second - utf32_output;
+  const char *end = buf + len;
+  if (ret.first == end) {
+    return saved_bytes;
   }
 
-  uint8_t checks =
-      vmaxvq_u8(vorrq_u8(vorrq_u8(vandq_u8(lo0, hi0), vandq_u8(lo1, hi1)),
-                         vorrq_u8(vandq_u8(lo2, hi2), vandq_u8(lo3, hi3))));
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  const uint8x16_t bit_mask =
-      simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                              0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
-#else
-  const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
-                               0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
-#endif
-  uint64_t badcharmask = 0;
-  *error = checks > 0x3;
-  if (checks) {
-    // Add each of the elements next to each other, successively, to stuff each
-    // 8 byte mask into one.
-    uint8x16_t test0 = vtstq_u8(lo0, hi0);
-    uint8x16_t test1 = vtstq_u8(lo1, hi1);
-    uint8x16_t test2 = vtstq_u8(lo2, hi2);
-    uint8x16_t test3 = vtstq_u8(lo3, hi3);
-    uint8x16_t sum0 =
-        vpaddq_u8(vandq_u8(test0, bit_mask), vandq_u8(test1, bit_mask));
-    uint8x16_t sum1 =
-        vpaddq_u8(vandq_u8(test2, bit_mask), vandq_u8(test3, bit_mask));
-    sum0 = vpaddq_u8(sum0, sum1);
-    sum0 = vpaddq_u8(sum0, sum0);
-    badcharmask = vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+  // Note: the AVX512 procedure looks up 4 bytes forward, and
+  //       correctly converts multi-byte chars even if their
+  //       continuation bytes lie outside 16-byte window.
+  //       It means, we have to skip continuation bytes from
+  //       the beginning ret.first, as they were already consumed.
+  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
+    ret.first += 1;
   }
-  // This is the transformation step that can be done while we are waiting for
-  // sum0
-  uint8x16_t roll_lut;
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-  if (base64_url) {
-    roll_lut =
-        simdutf_make_uint8x16_t(0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
-                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
-  } else {
-    roll_lut =
-        simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
-                                0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+  if (ret.first != end) {
+    const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
+        ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
   }
-#else
-  if (base64_url) {
-    roll_lut = uint8x16_t{0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
-                          0x0,  0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
-  } else {
-    roll_lut = uint8x16_t{0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9,
-                          0x0, 0x0,  0x0,  0x0, 0x0,  0x0,  0x0,  0x0};
+
+  return saved_bytes;
+}
+
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+    const char *buf, size_t len, char32_t *utf32) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return {error_code::SUCCESS, 0};
+  }
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32);
+  auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<
+      endianness::LITTLE, uint32_t>(buf, len, utf32_output);
+
+  if (!std::get<2>(ret)) {
+    size_t pos = std::get<0>(ret) - buf;
+    // We might have an error that occurs right before  pos.
+    // This is only a concern if buf[pos] is not a continuation byte.
+    if ((buf[pos] & 0xc0) != 0x80 && pos >= 64) {
+      pos -= 1;
+    } else if ((buf[pos] & 0xc0) == 0x80 && pos >= 64) {
+      // We must check whether we are the fourth continuation byte
+      bool c1 = (buf[pos - 1] & 0xc0) == 0x80;
+      bool c2 = (buf[pos - 2] & 0xc0) == 0x80;
+      bool c3 = (buf[pos - 3] & 0xc0) == 0x80;
+      if (c1 && c2 && c3) {
+        return {simdutf::TOO_LONG, pos};
+      }
+    }
+    // todo: we reset the output to utf32 instead of using std::get<1>(ret) as
+    // you'd expect. That is because
+    // validating_utf8_to_fixed_length_with_constant_checks may have processed
+    // data beyond the error.
+    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+        pos, buf + pos, len - pos, utf32);
+    res.count += pos;
+    return res;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output;
+  const char *end = buf + len;
+  if (std::get<0>(ret) == end) {
+    return {simdutf::SUCCESS, saved_bytes};
+  }
+
+  // Note: the AVX512 procedure looks up 4 bytes forward, and
+  //       correctly converts multi-byte chars even if their
+  //       continuation bytes lie outside 16-byte window.
+  //       It means, we have to skip continuation bytes from
+  //       the beginning ret.first, as they were already consumed.
+  while (std::get<0>(ret) != end and
+         ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) {
+    std::get<0>(ret) += 1;
+  }
+
+  if (std::get<0>(ret) != end) {
+    auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
+        std::get<0>(ret), len - (std::get<0>(ret) - buf),
+        reinterpret_cast<char32_t *>(utf32_output) + saved_bytes);
+    if (scalar_result.error != simdutf::SUCCESS) {
+      scalar_result.count += (std::get<0>(ret) - buf);
+    } else {
+      scalar_result.count += saved_bytes;
+    }
+    return scalar_result;
+  }
+
+  return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)};
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_out) const noexcept {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+  utf8_to_utf32_result ret =
+      icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(
+          buf, len, utf32_output);
+  size_t saved_bytes = ret.second - utf32_output;
+  const char *end = buf + len;
+  if (ret.first == end) {
+    return saved_bytes;
+  }
+
+  // Note: the AVX512 procedure looks up 4 bytes forward, and
+  //       correctly converts multi-byte chars even if their
+  //       continuation bytes lie outside the 16-byte window.
+  //       This means we have to skip continuation bytes at
+  //       the beginning of ret.first, as they were already consumed.
+  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
+    ret.first += 1;
   }
-#endif
-  uint8x16_t vsecond_last = base64_url ? vdupq_n_u8(0x2d) : vdupq_n_u8(0x2f);
-  if (base64_url) {
-    hi_nibbles0 = vbicq_u8(hi_nibbles0, underscore0);
-    hi_nibbles1 = vbicq_u8(hi_nibbles1, underscore1);
-    hi_nibbles2 = vbicq_u8(hi_nibbles2, underscore2);
-    hi_nibbles3 = vbicq_u8(hi_nibbles3, underscore3);
+
+  if (ret.first != end) {
+    const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid(
+        ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
   }
-  uint8x16_t roll0 = vqtbl1q_u8(
-      roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], vsecond_last), hi_nibbles0));
-  uint8x16_t roll1 = vqtbl1q_u8(
-      roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], vsecond_last), hi_nibbles1));
-  uint8x16_t roll2 = vqtbl1q_u8(
-      roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], vsecond_last), hi_nibbles2));
-  uint8x16_t roll3 = vqtbl1q_u8(
-      roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], vsecond_last), hi_nibbles3));
-  b->chunks[0] = vaddq_u8(b->chunks[0], roll0);
-  b->chunks[1] = vaddq_u8(b->chunks[1], roll1);
-  b->chunks[2] = vaddq_u8(b->chunks[2], roll2);
-  b->chunks[3] = vaddq_u8(b->chunks[3], roll3);
-  return badcharmask;
+
+  return saved_bytes;
 }
 
-void copy_block(block64 *b, char *output) {
-  vst1q_u8((uint8_t *)output, b->chunks[0]);
-  vst1q_u8((uint8_t *)output + 16, b->chunks[1]);
-  vst1q_u8((uint8_t *)output + 32, b->chunks[2]);
-  vst1q_u8((uint8_t *)output + 48, b->chunks[3]);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  return icelake_convert_utf16_to_latin1<endianness::LITTLE>(buf, len,
+                                                             latin1_output);
 }
 
-uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
-  uint64_t popcounts =
-      vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0);
-  uint64_t offsets = popcounts * 0x0101010101010101;
-  compress(b->chunks[0], uint16_t(mask), output);
-  compress(b->chunks[1], uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF]);
-  compress(b->chunks[2], uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF]);
-  compress(b->chunks[3], uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF]);
-  return offsets >> 56;
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  return icelake_convert_utf16_to_latin1<endianness::BIG>(buf, len,
+                                                          latin1_output);
 }
 
-// The caller of this function is responsible to ensure that there are 64 bytes
-// available from reading at src. The data is read into a block64 structure.
-void load_block(block64 *b, const char *src) {
-  b->chunks[0] = vld1q_u8(reinterpret_cast<const uint8_t *>(src));
-  b->chunks[1] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 16);
-  b->chunks[2] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 32);
-  b->chunks[3] = vld1q_u8(reinterpret_cast<const uint8_t *>(src) + 48);
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  return icelake_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
+             buf, len, latin1_output)
+      .first;
 }
 
-// The caller of this function is responsible to ensure that there are 32 bytes
-// available from reading at data. It returns a 16-byte value, narrowing with
-// saturation the 16-bit words.
-inline uint8x16_t load_satured(const uint16_t *data) {
-  uint16x8_t in1 = vld1q_u16(data);
-  uint16x8_t in2 = vld1q_u16(data + 8);
-  return vqmovn_high_u16(vqmovn_u16(in1), in2);
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  return icelake_convert_utf16_to_latin1_with_errors<endianness::BIG>(
+             buf, len, latin1_output)
+      .first;
 }
 
-// The caller of this function is responsible to ensure that there are 128 bytes
-// available from reading at src. The data is read into a block64 structure.
-void load_block(block64 *b, const char16_t *src) {
-  b->chunks[0] = load_satured(reinterpret_cast<const uint16_t *>(src));
-  b->chunks[1] = load_satured(reinterpret_cast<const uint16_t *>(src) + 16);
-  b->chunks[2] = load_satured(reinterpret_cast<const uint16_t *>(src) + 32);
-  b->chunks[3] = load_satured(reinterpret_cast<const uint16_t *>(src) + 48);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  // optimization opportunity: implement custom function
+  return convert_utf16be_to_latin1(buf, len, latin1_output);
 }
 
-// decode 64 bytes and output 48 bytes
-void base64_decode_block(char *out, const char *src) {
-  uint8x16x4_t str = vld4q_u8((uint8_t *)src);
-  uint8x16x3_t outvec;
-  outvec.val[0] =
-      vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
-  outvec.val[1] =
-      vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
-  outvec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
-  vst3q_u8((uint8_t *)out, outvec);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  // optimization opportunity: implement custom function
+  return convert_utf16le_to_latin1(buf, len, latin1_output);
 }
 
-template <bool base64_url, typename char_type>
-full_result
-compress_decode_base64(char *dst, const char_type *src, size_t srclen,
-                       base64_options options,
-                       last_chunk_handling_options last_chunk_options) {
-  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
-                                        : tables::base64::to_base64_value;
-  size_t equallocation =
-      srclen; // location of the first padding character if any
-  // skip trailing spaces
-  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
-         to_base64[uint8_t(src[srclen - 1])] == 64) {
-    srclen--;
-  }
-  size_t equalsigns = 0;
-  if (srclen > 0 && src[srclen - 1] == '=') {
-    equallocation = srclen - 1;
-    srclen--;
-    equalsigns = 1;
-    // skip trailing spaces
-    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
-           to_base64[uint8_t(src[srclen - 1])] == 64) {
-      srclen--;
-    }
-    if (srclen > 0 && src[srclen - 1] == '=') {
-      equallocation = srclen - 1;
-      srclen--;
-      equalsigns = 2;
-    }
-  }
-  if (srclen == 0) {
-    if (equalsigns > 0) {
-      return {INVALID_BASE64_CHARACTER, equallocation, 0};
-    }
-    return {SUCCESS, 0, 0};
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  size_t outlen;
+  size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(
+      buf, len, (unsigned char *)utf8_output, &outlen);
+  if (inlen != len) {
+    return 0;
   }
-  const char_type *const srcinit = src;
-  const char *const dstinit = dst;
-  const char_type *const srcend = src + srclen;
-
-  constexpr size_t block_size = 10;
-  char buffer[block_size * 64];
-  char *bufferptr = buffer;
-  if (srclen >= 64) {
-    const char_type *const srcend64 = src + srclen - 64;
-    while (src <= srcend64) {
-      block64 b;
-      load_block(&b, src);
-      src += 64;
-      bool error = false;
-      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
-      if (badcharmask) {
-        if (error) {
-          src -= 64;
-          while (src < srcend && scalar::base64::is_eight_byte(*src) &&
-                 to_base64[uint8_t(*src)] <= 64) {
-            src++;
-          }
-          if (src < srcend) {
-            // should never happen
-          }
-          return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
-                  size_t(dst - dstinit)};
-        }
-      }
+  return outlen;
+}
 
-      if (badcharmask != 0) {
-        // optimization opportunity: check for simple masks like those made of
-        // continuous 1s followed by continuous 0s. And masks containing a
-        // single bad character.
-        bufferptr += compress_block(&b, badcharmask, bufferptr);
-      } else {
-        // optimization opportunity: if bufferptr == buffer and mask == 0, we
-        // can avoid the call to compress_block and decode directly.
-        copy_block(&b, bufferptr);
-        bufferptr += 64;
-      }
-      if (bufferptr >= (block_size - 1) * 64 + buffer) {
-        for (size_t i = 0; i < (block_size - 1); i++) {
-          base64_decode_block(dst, buffer + i * 64);
-          dst += 48;
-        }
-        std::memcpy(buffer, buffer + (block_size - 1) * 64,
-                    64); // 64 might be too much
-        bufferptr -= (block_size - 1) * 64;
-      }
-    }
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  size_t outlen;
+  size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(
+      buf, len, (unsigned char *)utf8_output, &outlen);
+  if (inlen != len) {
+    return 0;
   }
-  char *buffer_start = buffer;
-  // Optimization note: if this is almost full, then it is worth our
-  // time, otherwise, we should just decode directly.
-  int last_block = (int)((bufferptr - buffer_start) % 64);
-  if (last_block != 0 && srcend - src + last_block >= 64) {
-    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = to_base64[uint8_t(*src)];
-      *bufferptr = char(val);
-      if (!scalar::base64::is_eight_byte(*src) || val > 64) {
-        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
-                size_t(dst - dstinit)};
-      }
-      bufferptr += (val <= 63);
-      src++;
-    }
+  return outlen;
+}
+
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  size_t outlen;
+  size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(
+      buf, len, (unsigned char *)utf8_output, &outlen);
+  if (inlen != len) {
+    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+        buf + inlen, len - inlen, utf8_output + outlen);
+    res.count += inlen;
+    return res;
   }
+  return {simdutf::SUCCESS, outlen};
+}
 
-  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
-    base64_decode_block(dst, buffer_start);
-    dst += 48;
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  size_t outlen;
+  size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(
+      buf, len, (unsigned char *)utf8_output, &outlen);
+  if (inlen != len) {
+    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+        buf + inlen, len - inlen, utf8_output + outlen);
+    res.count += inlen;
+    return res;
   }
-  if ((bufferptr - buffer_start) % 64 != 0) {
-    while (buffer_start + 4 < bufferptr) {
-      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
-                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
-                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
-                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
-                        << 8;
-      triple = scalar::utf32::swap_bytes(triple);
-      std::memcpy(dst, &triple, 4);
+  return {simdutf::SUCCESS, outlen};
+}
 
-      dst += 3;
-      buffer_start += 4;
-    }
-    if (buffer_start + 4 <= bufferptr) {
-      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
-                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
-                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
-                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
-                        << 8;
-      triple = scalar::utf32::swap_bytes(triple);
-      std::memcpy(dst, &triple, 3);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return convert_utf16le_to_utf8(buf, len, utf8_output);
+}
 
-      dst += 3;
-      buffer_start += 4;
-    }
-    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
-    // backtrack
-    int leftover = int(bufferptr - buffer_start);
-    while (leftover > 0) {
-      while (to_base64[uint8_t(*(src - 1))] == 64) {
-        src--;
-      }
-      src--;
-      leftover--;
-    }
-  }
-  if (src < srcend + equalsigns) {
-    full_result r = scalar::base64::base64_tail_decode(
-        dst, src, srcend - src, equalsigns, options, last_chunk_options);
-    r.input_count += size_t(src - srcinit);
-    if (r.error == error_code::INVALID_BASE64_CHARACTER ||
-        r.error == error_code::BASE64_EXTRA_BITS) {
-      return r;
-    } else {
-      r.output_count += size_t(dst - dstinit);
-    }
-    if (last_chunk_options != stop_before_partial &&
-        r.error == error_code::SUCCESS && equalsigns > 0) {
-      // additional checks
-      if ((r.output_count % 3 == 0) ||
-          ((r.output_count % 3) + 1 + equalsigns != 4)) {
-        r.error = error_code::INVALID_BASE64_CHARACTER;
-        r.input_count = equallocation;
-      }
-    }
-    return r;
-  }
-  if (equalsigns > 0) {
-    if ((size_t(dst - dstinit) % 3 == 0) ||
-        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
-    }
-  }
-  return {SUCCESS, srclen, size_t(dst - dstinit)};
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return convert_utf16be_to_utf8(buf, len, utf8_output);
 }
-/* end file src/arm64/arm_base64.cpp */
-/* begin file src/arm64/arm_convert_utf32_to_latin1.cpp */
-std::pair<const char32_t *, char *>
-arm_convert_utf32_to_latin1(const char32_t *buf, size_t len,
-                            char *latin1_output) {
-  const char32_t *end = buf + len;
-  while (end - buf >= 8) {
-    uint32x4_t in1 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
-    uint32x4_t in2 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));
 
-    uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2));
-    if (vmaxvq_u16(utf16_packed) <= 0xff) {
-      // 1. pack the bytes
-      uint8x8_t latin1_packed = vmovn_u16(utf16_packed);
-      // 2. store (8 bytes)
-      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
-      // 3. adjust pointers
-      buf += 8;
-      latin1_output += 8;
-    } else {
-      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
-    }
-  } // while
-  return std::make_pair(buf, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  return icelake_convert_utf32_to_latin1(buf, len, latin1_output);
 }
 
-std::pair<result, char *>
-arm_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
-                                        char *latin1_output) {
-  const char32_t *start = buf;
-  const char32_t *end = buf + len;
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  return icelake_convert_utf32_to_latin1_with_errors(buf, len, latin1_output)
+      .first;
+}
 
-  while (end - buf >= 8) {
-    uint32x4_t in1 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
-    uint32x4_t in2 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  return icelake_convert_utf32_to_latin1(buf, len, latin1_output);
+}
 
-    uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2));
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char32_t *, char *> ret =
+      avx512_convert_utf32_to_utf8(buf, len, utf8_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
 
-    if (vmaxvq_u16(utf16_packed) <= 0xff) {
-      // 1. pack the bytes
-      uint8x8_t latin1_packed = vmovn_u16(utf16_packed);
-      // 2. store (8 bytes)
-      vst1_u8(reinterpret_cast<uint8_t *>(latin1_output), latin1_packed);
-      // 3. adjust pointers
-      buf += 8;
-      latin1_output += 8;
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char *> ret =
+      icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
     } else {
-      // Let us do a scalar fallback.
-      for (int k = 0; k < 8; k++) {
-        uint32_t word = buf[k];
-        if (word <= 0xff) {
-          *latin1_output++ = char(word);
-        } else {
-          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
-                                latin1_output);
-        }
-      }
+      ret.second += scalar_res.count;
     }
-  } // while
-  return std::make_pair(result(error_code::SUCCESS, buf - start),
-                        latin1_output);
+  }
+  ret.first.count =
+      ret.second -
+      utf8_output; // Set count to the number of 8-bit code units written
+  return ret.first;
 }
-/* end file src/arm64/arm_convert_utf32_to_latin1.cpp */
-/* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */
-template <endianness big_endian>
-std::pair<const char32_t *, char16_t *>
-arm_convert_utf32_to_utf16(const char32_t *buf, size_t len,
-                           char16_t *utf16_out) {
-  uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
-  const char32_t *end = buf + len;
 
-  uint16x4_t forbidden_bytemask = vmov_n_u16(0x0);
-
-  while (end - buf >= 4) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
-
-    // Check if no bits set above 16th
-    if (vmaxvq_u32(in) <= 0xFFFF) {
-      uint16x4_t utf16_packed = vmovn_u32(in);
-
-      const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
-      const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
-      forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff),
-                                             vcge_u16(utf16_packed, v_d800)),
-                                    forbidden_bytemask);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  return convert_utf32_to_utf8(buf, len, utf8_output);
+}
 
-      if (!match_system(big_endian)) {
-        utf16_packed =
-            vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(utf16_packed)));
-      }
-      vst1_u16(utf16_output, utf16_packed);
-      utf16_output += 4;
-      buf += 4;
-    } else {
-      size_t forward = 3;
-      size_t k = 0;
-      if (size_t(end - buf) < forward + 1) {
-        forward = size_t(end - buf - 1);
-      }
-      for (; k < forward; k++) {
-        uint32_t word = buf[k];
-        if ((word & 0xFFFF0000) == 0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) {
-            return std::make_pair(nullptr,
-                                  reinterpret_cast<char16_t *>(utf16_output));
-          }
-          *utf16_output++ = !match_system(big_endian)
-                                ? char16_t(word >> 8 | word << 8)
-                                : char16_t(word);
-        } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) {
-            return std::make_pair(nullptr,
-                                  reinterpret_cast<char16_t *>(utf16_output));
-          }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (!match_system(big_endian)) {
-            high_surrogate =
-                uint16_t(high_surrogate >> 8 | high_surrogate << 8);
-            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
-        }
-      }
-      buf += k;
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char32_t *, char16_t *> ret =
+      avx512_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char32_t *, char16_t *> ret =
+      avx512_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf32_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
     }
+    saved_bytes += scalar_saved_bytes;
   }
+  return saved_bytes;
+}
 
-  // check for invalid input
-  if (vmaxv_u16(forbidden_bytemask) != 0) {
-    return std::make_pair(nullptr, reinterpret_cast<char16_t *>(utf16_output));
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // ret.first.count is always the position in the input buffer, not the
+  // number of code units written, even if the conversion finished
+  std::pair<result, char16_t *> ret =
+      avx512_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
+          buf, len, utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res =
+        scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
   }
+  ret.first.count =
+      ret.second -
+      utf16_output; // Set count to the number of 16-bit code units written
+  return ret.first;
+}
 
-  return std::make_pair(buf, reinterpret_cast<char16_t *>(utf16_output));
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // ret.first.count is always the position in the input buffer, not the
+  // number of code units written, even if the conversion finished
+  std::pair<result, char16_t *> ret =
+      avx512_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
+                                                                 utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res =
+        scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf16_output; // Set count to the number of 16-bit code units written
+  return ret.first;
 }
 
-template <endianness big_endian>
-std::pair<result, char16_t *>
-arm_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
-                                       char16_t *utf16_out) {
-  uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
-  const char32_t *start = buf;
-  const char32_t *end = buf + len;
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return convert_utf32_to_utf16le(buf, len, utf16_output);
+}
 
-  while (end - buf >= 4) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return convert_utf32_to_utf16be(buf, len, utf16_output);
+}
 
-    // Check if no bits set above 16th
-    if (vmaxvq_u32(in) <= 0xFFFF) {
-      uint16x4_t utf16_packed = vmovn_u32(in);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::tuple<const char16_t *, char32_t *, bool> ret =
+      icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
+                                                          utf32_output);
+  if (!std::get<2>(ret)) {
+    return 0;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output;
+  if (std::get<0>(ret) != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
 
-      const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800);
-      const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff);
-      const uint16x4_t forbidden_bytemask = vand_u16(
-          vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800));
-      if (vmaxv_u16(forbidden_bytemask) != 0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start),
-                              reinterpret_cast<char16_t *>(utf16_output));
-      }
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::tuple<const char16_t *, char32_t *, bool> ret =
+      icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  if (!std::get<2>(ret)) {
+    return 0;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output;
+  if (std::get<0>(ret) != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf32::convert<endianness::BIG>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
 
-      if (!match_system(big_endian)) {
-        utf16_packed =
-            vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(utf16_packed)));
-      }
-      vst1_u16(utf16_output, utf16_packed);
-      utf16_output += 4;
-      buf += 4;
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::tuple<const char16_t *, char32_t *, bool> ret =
+      icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
+                                                          utf32_output);
+  if (!std::get<2>(ret)) {
+    result scalar_res =
+        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    scalar_res.count += (std::get<0>(ret) - buf);
+    return scalar_res;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output;
+  if (std::get<0>(ret) != buf + len) {
+    result scalar_res =
+        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_res.error) {
+      scalar_res.count += (std::get<0>(ret) - buf);
+      return scalar_res;
     } else {
-      size_t forward = 3;
-      size_t k = 0;
-      if (size_t(end - buf) < forward + 1) {
-        forward = size_t(end - buf - 1);
-      }
-      for (; k < forward; k++) {
-        uint32_t word = buf[k];
-        if ((word & 0xFFFF0000) == 0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) {
-            return std::make_pair(
-                result(error_code::SURROGATE, buf - start + k),
-                reinterpret_cast<char16_t *>(utf16_output));
-          }
-          *utf16_output++ = !match_system(big_endian)
-                                ? char16_t(word >> 8 | word << 8)
-                                : char16_t(word);
-        } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) {
-            return std::make_pair(
-                result(error_code::TOO_LARGE, buf - start + k),
-                reinterpret_cast<char16_t *>(utf16_output));
-          }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (!match_system(big_endian)) {
-            high_surrogate =
-                uint16_t(high_surrogate >> 8 | high_surrogate << 8);
-            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
-        }
-      }
-      buf += k;
+      scalar_res.count += saved_bytes;
+      return scalar_res;
     }
   }
-
-  return std::make_pair(result(error_code::SUCCESS, buf - start),
-                        reinterpret_cast<char16_t *>(utf16_output));
+  return simdutf::result(simdutf::SUCCESS, saved_bytes);
 }
-/* end file src/arm64/arm_convert_utf32_to_utf16.cpp */
-/* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */
-std::pair<const char32_t *, char *>
-arm_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) {
-  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
-  const char32_t *end = buf + len;
-
-  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
 
-  uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0);
-  const size_t safety_margin =
-      12; // to avoid overruns, see issue
-          // https://github.com/simdutf/simdutf/issues/92
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::tuple<const char16_t *, char32_t *, bool> ret =
+      icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  if (!std::get<2>(ret)) {
+    result scalar_res =
+        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    scalar_res.count += (std::get<0>(ret) - buf);
+    return scalar_res;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output;
+  if (std::get<0>(ret) != buf + len) {
+    result scalar_res =
+        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_res.error) {
+      scalar_res.count += (std::get<0>(ret) - buf);
+      return scalar_res;
+    } else {
+      scalar_res.count += saved_bytes;
+      return scalar_res;
+    }
+  }
+  return simdutf::result(simdutf::SUCCESS, saved_bytes);
+}
 
-  while (buf + 16 + safety_margin < end) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
-    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::tuple<const char16_t *, char32_t *, bool> ret =
+      icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
+                                                          utf32_output);
+  if (!std::get<2>(ret)) {
+    return 0;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output;
+  if (std::get<0>(ret) != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
 
-    // Check if no bits set above 16th
-    if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
-      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
-      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
-      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
-      if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
-        // 1. pack the bytes
-        // obviously suboptimal.
-        uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
-        // 2. store (8 bytes)
-        vst1_u8(utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 8;
-        utf8_output += 8;
-        continue; // we are done for this round!
-      }
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::tuple<const char16_t *, char32_t *, bool> ret =
+      icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  if (!std::get<2>(ret)) {
+    return 0;
+  }
+  size_t saved_bytes = std::get<1>(ret) - utf32_output;
+  if (std::get<0>(ret) != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf32::convert<endianness::BIG>(
+            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
 
-      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
-        // 1. prepare 2-byte values
-        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-        // expected output   : [110a|aaaa|10bb|bbbb] x 8
-        const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-        const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+void implementation::change_endianness_utf16(const char16_t *input,
+                                             size_t length,
+                                             char16_t *output) const noexcept {
+  size_t pos = 0;
+  const __m512i byteflip = _mm512_setr_epi64(
+      0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+      0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+      0x0607040502030001, 0x0e0f0c0d0a0b0809);
+  while (pos + 32 <= length) {
+    __m512i utf16 = _mm512_loadu_si512((const __m512i *)(input + pos));
+    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
+    _mm512_storeu_si512(output + pos, utf16);
+    pos += 32;
+  }
+  if (pos < length) {
+    __mmask32 m((1U << (length - pos)) - 1);
+    __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i *)(input + pos));
+    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
+    _mm512_mask_storeu_epi16(output + pos, m, utf16);
+  }
+}
 
-        // t0 = [000a|aaaa|bbbb|bb00]
-        const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
-        // t1 = [000a|aaaa|0000|0000]
-        const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-        // t2 = [0000|0000|00bb|bbbb]
-        const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
-        // t3 = [000a|aaaa|00bb|bbbb]
-        const uint16x8_t t3 = vorrq_u16(t1, t2);
-        // t4 = [110a|aaaa|10bb|bbbb]
-        const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-        // 2. merge ASCII and 2-byte codewords
-        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-        const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(
-            vbslq_u16(one_byte_bytemask, utf16_packed, t4));
-        // 3. prepare bitmask for 8-bit lookup
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t mask = simdutf_make_uint16x8_t(
-            0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
-#else
-        const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
-                                 0x0002, 0x0008, 0x0020, 0x0080};
-#endif
-        uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-        // 4. pack the bytes
-        const uint8_t *row =
-            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-        const uint8x16_t shuffle = vld1q_u8(row + 1);
-        const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+simdutf_warn_unused size_t implementation::count_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  const char16_t *ptr = input;
+  size_t count{0};
 
-        // 5. store bytes
-        vst1q_u8(utf8_output, utf8_packed);
+  if (length >= 32) {
+    const char16_t *end = input + length - 32;
 
-        // 6. adjust pointers
-        buf += 8;
-        utf8_output += row[0];
-        continue;
-      } else {
-        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
-        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
-        forbidden_bytemask =
-            vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff),
-                                vcgeq_u16(utf16_packed, v_d800)),
-                      forbidden_bytemask);
+    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
+    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
 
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t dup_even = simdutf_make_uint16x8_t(
-            0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-#else
-        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
-                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
-#endif
-        /* In this branch we handle three cases:
-          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
-          single UFT-8 byte
-          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
-          two UTF-8 bytes
-          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
-          three UTF-8 bytes
+    while (ptr <= end) {
+      __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr);
+      ptr += 32;
+      uint64_t not_high_surrogate =
+          static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) |
+                                _mm512_cmplt_epu16_mask(utf16, low));
+      count += count_ones(not_high_surrogate);
+    }
+  }
 
-          We expand the input word (16-bit) into two code units (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
+  return count + scalar::utf16::count_code_points<endianness::LITTLE>(
+                     ptr, length - (ptr - input));
+}
 
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
+simdutf_warn_unused size_t implementation::count_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  const char16_t *ptr = input;
+  size_t count{0};
+  if (length >= 32) {
 
-          We precompute byte 1 for case #3 and -- **conditionally** --
-          precompute either byte 1 for case #2 or byte 2 for case #3. Note that
-          they differ by exactly one bit.
+    const char16_t *end = input + length - 32;
 
-          Finally from these two code units we build proper UTF-8 sequence,
-          taking into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const uint16x8_t t0 =
-            vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed),
-                                            vreinterpretq_u8_u16(dup_even)));
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
+    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
 
-        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-        const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
-        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-        const uint16x8_t s1 =
-            vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
-        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-        // [00bb|bbbb|0000|aaaa]
-        const uint16x8_t s2 = vorrq_u16(s0, s1s);
-        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
-        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-        const uint16x8_t one_or_two_bytes_bytemask =
-            vcleq_u16(utf16_packed, v_07ff);
-        const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000),
-                                        one_or_two_bytes_bytemask);
-        const uint16x8_t s4 = veorq_u16(s3, m0);
-#undef simdutf_vec
+    const __m512i byteflip = _mm512_setr_epi64(
+        0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+        0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+        0x0607040502030001, 0x0e0f0c0d0a0b0809);
+    while (ptr <= end) {
+      __m512i utf16 =
+          _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)ptr), byteflip);
+      ptr += 32;
+      uint64_t not_high_surrogate =
+          static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) |
+                                _mm512_cmplt_epu16_mask(utf16, low));
+      count += count_ones(not_high_surrogate);
+    }
+  }
 
-        // 4. expand code units 16-bit => 32-bit
-        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+  return count + scalar::utf16::count_code_points<endianness::BIG>(
+                     ptr, length - (ptr - input));
+}
 
-        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t onemask = simdutf_make_uint16x8_t(
-            0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
-        const uint16x8_t twomask = simdutf_make_uint16x8_t(
-            0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
-#else
-        const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
-                                    0x0100, 0x0400, 0x1000, 0x4000};
-        const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
-                                    0x0200, 0x0800, 0x2000, 0x8000};
-#endif
-        const uint16x8_t combined =
-            vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
-                      vandq_u16(one_or_two_bytes_bytemask, twomask));
-        const uint16_t mask = vaddvq_u16(combined);
-        // The following fast path may or may not be beneficial.
-        /*if(mask == 0) {
-          // We only have three-byte code units. Use fast path.
-          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-          vst1q_u8(utf8_output, utf8_0);
-          utf8_output += 12;
-          vst1q_u8(utf8_output, utf8_1);
-          utf8_output += 12;
-          buf += 8;
-          continue;
-        }*/
-        const uint8_t mask0 = uint8_t(mask);
-        const uint8_t *row0 =
-            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept { // = whole-chunk bytes - continuation bytes + scalar count of the tail
+  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
+  size_t answer =
+      length / sizeof(__m512i) *
+      sizeof(__m512i); // Bytes covered by whole 512-bit chunks of the input.
+  size_t i = 0;
+  __m512i unrolled_popcount{0}; // per-lane bit counts gathered by the 8x loop
 
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-        const uint8_t *row1 =
-            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+  const __m512i continuation = _mm512_set1_epi8(char(0b10111111)); // 0xBF
 
-        vst1q_u8(utf8_output, utf8_0);
-        utf8_output += row0[0];
-        vst1q_u8(utf8_output, utf8_1);
-        utf8_output += row1[0];
+  while (i + sizeof(__m512i) <= length) {
+    size_t iterations = (length - i) / sizeof(__m512i);
 
-        buf += 8;
-      }
-      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
-      // will produce four UTF-8 bytes.
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if (size_t(end - buf) < forward + 1) {
-        forward = size_t(end - buf - 1);
-      }
-      for (; k < forward; k++) {
-        uint32_t word = buf[k];
-        if ((word & 0xFFFFFF80) == 0) {
-          *utf8_output++ = char(word);
-        } else if ((word & 0xFFFFF800) == 0) {
-          *utf8_output++ = char((word >> 6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if ((word & 0xFFFF0000) == 0) {
-          if (word >= 0xD800 && word <= 0xDFFF) {
-            return std::make_pair(nullptr,
-                                  reinterpret_cast<char *>(utf8_output));
-          }
-          *utf8_output++ = char((word >> 12) | 0b11100000);
-          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {
-          if (word > 0x10FFFF) {
-            return std::make_pair(nullptr,
-                                  reinterpret_cast<char *>(utf8_output));
-          }
-          *utf8_output++ = char((word >> 18) | 0b11110000);
-          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
+    size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); // offset of the last whole chunk
+    for (; i + 8 * sizeof(__m512i) <= max_i; i += 8 * sizeof(__m512i)) {
+      __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
+      __m512i input2 =
+          _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
+      __m512i input3 =
+          _mm512_loadu_si512((const __m512i *)(str + i + 2 * sizeof(__m512i)));
+      __m512i input4 =
+          _mm512_loadu_si512((const __m512i *)(str + i + 3 * sizeof(__m512i)));
+      __m512i input5 =
+          _mm512_loadu_si512((const __m512i *)(str + i + 4 * sizeof(__m512i)));
+      __m512i input6 =
+          _mm512_loadu_si512((const __m512i *)(str + i + 5 * sizeof(__m512i)));
+      __m512i input7 =
+          _mm512_loadu_si512((const __m512i *)(str + i + 6 * sizeof(__m512i)));
+      __m512i input8 =
+          _mm512_loadu_si512((const __m512i *)(str + i + 7 * sizeof(__m512i)));
+      // Signed compare: only bytes 0x80..0xBF are <= 0xBF, so they set a bit.
+      __mmask64 mask1 = _mm512_cmple_epi8_mask(input1, continuation);
+      __mmask64 mask2 = _mm512_cmple_epi8_mask(input2, continuation);
+      __mmask64 mask3 = _mm512_cmple_epi8_mask(input3, continuation);
+      __mmask64 mask4 = _mm512_cmple_epi8_mask(input4, continuation);
+      __mmask64 mask5 = _mm512_cmple_epi8_mask(input5, continuation);
+      __mmask64 mask6 = _mm512_cmple_epi8_mask(input6, continuation);
+      __mmask64 mask7 = _mm512_cmple_epi8_mask(input7, continuation);
+      __mmask64 mask8 = _mm512_cmple_epi8_mask(input8, continuation);
+      // Place the eight 64-bit masks in the eight lanes of one register.
+      __m512i mask_register = _mm512_set_epi64(mask8, mask7, mask6, mask5,
+                                               mask4, mask3, mask2, mask1);
+      // Add each lane's bit count to the running per-lane totals.
+      unrolled_popcount = _mm512_add_epi64(unrolled_popcount,
+                                           _mm512_popcnt_epi64(mask_register));
     }
-  } // while
 
-  // check for invalid input
-  if (vmaxvq_u16(forbidden_bytemask) != 0) {
-    return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
+    for (; i <= max_i; i += sizeof(__m512i)) { // leftover whole chunks, one at a time
+      __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));
+      uint64_t continuation_bitmask = static_cast<uint64_t>(
+          _mm512_cmple_epi8_mask(more_input, continuation));
+      answer -= count_ones(continuation_bitmask);
+    }
   }
-  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
-}
 
-std::pair<result, char *>
-arm_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
-                                      char *utf8_out) {
-  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
-  const char32_t *start = buf;
-  const char32_t *end = buf + len;
+  __m256i first_half = _mm512_extracti64x4_epi64(unrolled_popcount, 0);
+  __m256i second_half = _mm512_extracti64x4_epi64(unrolled_popcount, 1);
+  answer -= (size_t)_mm256_extract_epi64(first_half, 0) + // subtract the sum of the eight lane totals
+            (size_t)_mm256_extract_epi64(first_half, 1) +
+            (size_t)_mm256_extract_epi64(first_half, 2) +
+            (size_t)_mm256_extract_epi64(first_half, 3) +
+            (size_t)_mm256_extract_epi64(second_half, 0) +
+            (size_t)_mm256_extract_epi64(second_half, 1) +
+            (size_t)_mm256_extract_epi64(second_half, 2) +
+            (size_t)_mm256_extract_epi64(second_half, 3);
 
-  const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
-  const size_t safety_margin =
-      12; // to avoid overruns, see issue
-          // https://github.com/simdutf/simdutf/issues/92
+  return answer + scalar::utf8::count_code_points( // bytes past the last whole chunk go to the scalar routine
+                      reinterpret_cast<const char *>(str + i), length - i);
+}
 
-  while (buf + 16 + safety_margin < end) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
-    uint32x4_t nextin = vld1q_u32(reinterpret_cast<const uint32_t *>(buf + 4));
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+    const char *buf, size_t len) const noexcept {
+  return count_utf8(buf, len); // the Latin-1 length is whatever count_utf8 reports for this buffer
+}
 
-    // Check if no bits set above 16th
-    if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) {
-      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
-      // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp)
-      uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin));
-      if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!!
-        // 1. pack the bytes
-        // obviously suboptimal.
-        uint8x8_t utf8_packed = vmovn_u16(utf16_packed);
-        // 2. store (8 bytes)
-        vst1_u8(utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 8;
-        utf8_output += 8;
-        continue; // we are done for this round!
-      }
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t length) const noexcept {
+  return scalar::utf16::latin1_length_from_utf16(length); // no SIMD work here: forwards to the scalar helper
+}
 
-      if (vmaxvq_u16(utf16_packed) <= 0x7FF) {
-        // 1. prepare 2-byte values
-        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-        // expected output   : [110a|aaaa|10bb|bbbb] x 8
-        const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00);
-        const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f);
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t length) const noexcept {
+  return scalar::utf32::latin1_length_from_utf32(length); // no SIMD work here: forwards to the scalar helper
+}
 
-        // t0 = [000a|aaaa|bbbb|bb00]
-        const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2);
-        // t1 = [000a|aaaa|0000|0000]
-        const uint16x8_t t1 = vandq_u16(t0, v_1f00);
-        // t2 = [0000|0000|00bb|bbbb]
-        const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f);
-        // t3 = [000a|aaaa|00bb|bbbb]
-        const uint16x8_t t3 = vorrq_u16(t1, t2);
-        // t4 = [110a|aaaa|10bb|bbbb]
-        const uint16x8_t t4 = vorrq_u16(t3, v_c080);
-        // 2. merge ASCII and 2-byte codewords
-        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-        const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16(
-            vbslq_u16(one_byte_bytemask, utf16_packed, t4));
-        // 3. prepare bitmask for 8-bit lookup
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t mask = simdutf_make_uint16x8_t(
-            0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080);
-#else
-        const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040,
-                                 0x0002, 0x0008, 0x0020, 0x0080};
-#endif
-        uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask));
-        // 4. pack the bytes
-        const uint8_t *row =
-            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-        const uint8x16_t shuffle = vld1q_u8(row + 1);
-        const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept { // sums 1, 2 or 3 bytes per UTF-16 unit, 2 per surrogate unit
+  const char16_t *ptr = input;
+  size_t count{0};
+  if (length >= 32) {
+    const char16_t *end = input + length - 32; // last position with 32 units still available
 
-        // 5. store bytes
-        vst1q_u8(utf8_output, utf8_packed);
+    const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
+    const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
+    const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
+    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
 
-        // 6. adjust pointers
-        buf += 8;
-        utf8_output += row[0];
-        continue;
-      } else {
-        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+    while (ptr <= end) {
+      __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr);
+      ptr += 32;
+      __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
+      __mmask32 two_bytes_bitmask =
+          _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); // <= 0x07ff but not ASCII
+      __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
+      __mmask32 surrogates_bitmask =
+          _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) &
+          _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); // units in 0xd800..0xdfff
 
-        // check for invalid input
-        const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
-        const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff);
-        const uint16x8_t forbidden_bytemask = vandq_u16(
-            vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800));
-        if (vmaxvq_u16(forbidden_bytemask) != 0) {
-          return std::make_pair(result(error_code::SURROGATE, buf - start),
-                                reinterpret_cast<char *>(utf8_output));
-        }
+      size_t ascii_count = count_ones(ascii_bitmask);
+      size_t two_bytes_count = count_ones(two_bytes_bitmask);
+      size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
+      size_t three_bytes_count =
+          32 - ascii_count - two_bytes_count - surrogate_bytes_count; // every unit not classified above
 
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t dup_even = simdutf_make_uint16x8_t(
-            0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-#else
-        const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606,
-                                     0x0808, 0x0a0a, 0x0c0c, 0x0e0e};
-#endif
-        /* In this branch we handle three cases:
-          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
-          single UFT-8 byte
-          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
-          two UTF-8 bytes
-          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
-          three UTF-8 bytes
+      count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count +
+               2 * surrogate_bytes_count; // 2 per surrogate unit, so two surrogate units add 4
+    }
+  }
 
-          We expand the input word (16-bit) into two code units (32-bit), thus
-          we have room for four bytes. However, we need five distinct bit
-          layouts. Note that the last byte in cases #2 and #3 is the same.
+  return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>( // units left after the 32-unit blocks
+                     ptr, length - (ptr - input));
+}
 
-          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-          in register t2.
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept { // same per-unit sums as the LE variant, after swapping the bytes of each unit
+  const char16_t *ptr = input;
+  size_t count{0};
 
-          We precompute byte 1 for case #3 and -- **conditionally** --
-          precompute either byte 1 for case #2 or byte 2 for case #3. Note that
-          they differ by exactly one bit.
+  if (length >= 32) {
+    const char16_t *end = input + length - 32; // last position with 32 units still available
 
-          Finally from these two code units we build proper UTF-8 sequence,
-          taking into account the case (i.e, the number of bytes to write).
-        */
-        /**
-         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-         * t2 => [0ccc|cccc] [10cc|cccc]
-         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-         */
-#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
-        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-        const uint16x8_t t0 =
-            vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed),
-                                            vreinterpretq_u8_u16(dup_even)));
-        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-        const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
-        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-        const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+    const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
+    const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
+    const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
+    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
 
-        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
-        const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12);
-        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
-        const uint16x8_t s1 =
-            vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000));
-        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
-        const uint16x8_t s1s = vshlq_n_u16(s1, 2);
-        // [00bb|bbbb|0000|aaaa]
-        const uint16x8_t s2 = vorrq_u16(s0, s1s);
-        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-        const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
-        const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
-        const uint16x8_t one_or_two_bytes_bytemask =
-            vcleq_u16(utf16_packed, v_07ff);
-        const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000),
-                                        one_or_two_bytes_bytemask);
-        const uint16x8_t s4 = veorq_u16(s3, m0);
-#undef simdutf_vec
+    const __m512i byteflip = _mm512_setr_epi64( // shuffle indices 1,0,3,2,...: exchange the bytes of each 16-bit unit
+        0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
+        0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
+        0x0607040502030001, 0x0e0f0c0d0a0b0809);
+    while (ptr <= end) {
+      __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr);
+      utf16 = _mm512_shuffle_epi8(utf16, byteflip); // byte-swap before the numeric comparisons
+      ptr += 32;
+      __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
+      __mmask32 two_bytes_bitmask =
+          _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); // <= 0x07ff but not ASCII
+      __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
+      __mmask32 surrogates_bitmask =
+          _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) &
+          _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); // units in 0xd800..0xdfff
 
-        // 4. expand code units 16-bit => 32-bit
-        const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
-        const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+      size_t ascii_count = count_ones(ascii_bitmask);
+      size_t two_bytes_count = count_ones(two_bytes_bitmask);
+      size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
+      size_t three_bytes_count =
+          32 - ascii_count - two_bytes_count - surrogate_bytes_count; // every unit not classified above
+      count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count +
+               2 * surrogate_bytes_count; // 2 per surrogate unit, so two surrogate units add 4
+    }
+  }
 
-        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-        const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
-        const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f);
-#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
-        const uint16x8_t onemask = simdutf_make_uint16x8_t(
-            0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
-        const uint16x8_t twomask = simdutf_make_uint16x8_t(
-            0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
-#else
-        const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
-                                    0x0100, 0x0400, 0x1000, 0x4000};
-        const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
-                                    0x0200, 0x0800, 0x2000, 0x8000};
-#endif
-        const uint16x8_t combined =
-            vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
-                      vandq_u16(one_or_two_bytes_bytemask, twomask));
-        const uint16_t mask = vaddvq_u16(combined);
-        // The following fast path may or may not be beneficial.
-        /*if(mask == 0) {
-          // We only have three-byte code units. Use fast path.
-          const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0};
-          const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle);
-          const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle);
-          vst1q_u8(utf8_output, utf8_0);
-          utf8_output += 12;
-          vst1q_u8(utf8_output, utf8_1);
-          utf8_output += 12;
-          buf += 8;
-          continue;
-        }*/
-        const uint8_t mask0 = uint8_t(mask);
+  return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>( // units left after the 32-unit blocks
+                     ptr, length - (ptr - input));
+}
 
-        const uint8_t *row0 =
-            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-        const uint8x16_t shuffle0 = vld1q_u8(row0 + 1);
-        const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  return implementation::count_utf16le(input, length); // the UTF-32 length is whatever count_utf16le reports
+}
 
-        const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-        const uint8_t *row1 =
-            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-        const uint8x16_t shuffle1 = vld1q_u8(row1 + 1);
-        const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  return implementation::count_utf16be(input, length); // the UTF-32 length is whatever count_utf16be reports
+}
 
-        vst1q_u8(utf8_output, utf8_0);
-        utf8_output += row0[0];
-        vst1q_u8(utf8_output, utf8_1);
-        utf8_output += row1[0];
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t length) const noexcept {
+  return scalar::latin1::utf16_length_from_latin1(length); // no SIMD work here: forwards to the scalar helper
+}
 
-        buf += 8;
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t length) const noexcept {
+  return scalar::latin1::utf32_length_from_latin1(length); // no SIMD work here: forwards to the scalar helper
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+    const char *input, size_t length) const noexcept { // = whole-chunk bytes + count of bytes with the top bit set + scalar tail
+  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
+  size_t answer = length / sizeof(__m512i) * sizeof(__m512i); // bytes covered by whole 512-bit chunks
+  size_t i = 0;
+  if (answer >= 2048) { // long strings optimization
+    unsigned char v_0xFF = 0xff;
+    __m512i eight_64bits = _mm512_setzero_si512(); // eight 64-bit running sums
+    while (i + sizeof(__m512i) <= length) {
+      __m512i runner = _mm512_setzero_si512(); // 64 one-byte counters for this batch
+      size_t iterations = (length - i) / sizeof(__m512i);
+      if (iterations > 255) {
+        iterations = 255; // each chunk adds at most 1 per counter, so 255 chunks fit in a byte
       }
-      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
-      // will produce four UTF-8 bytes.
-    } else {
-      // Let us do a scalar fallback.
-      // It may seem wasteful to use scalar code, but being efficient with SIMD
-      // in the presence of surrogate pairs may require non-trivial tables.
-      size_t forward = 15;
-      size_t k = 0;
-      if (size_t(end - buf) < forward + 1) {
-        forward = size_t(end - buf - 1);
+      size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); // offset of the last chunk in this batch
+      for (; i + 4 * sizeof(__m512i) <= max_i; i += 4 * sizeof(__m512i)) {
+        // Load four __m512i vectors
+        __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
+        __m512i input2 =
+            _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
+        __m512i input3 = _mm512_loadu_si512(
+            (const __m512i *)(str + i + 2 * sizeof(__m512i)));
+        __m512i input4 = _mm512_loadu_si512(
+            (const __m512i *)(str + i + 3 * sizeof(__m512i)));
+        // Signed "0 > byte" holds exactly for bytes whose top bit is set.
+        // Generate four masks
+        __mmask64 mask1 =
+            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1);
+        __mmask64 mask2 =
+            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2);
+        __mmask64 mask3 =
+            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3);
+        __mmask64 mask4 =
+            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4);
+        // Apply the masks and subtract from the runner
+        __m512i not_ascii1 =
+            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF);
+        __m512i not_ascii2 =
+            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF);
+        __m512i not_ascii3 =
+            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF);
+        __m512i not_ascii4 =
+            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF);
+        // Subtracting 0xFF adds 1 (mod 256) to the counter of each flagged byte.
+        runner = _mm512_sub_epi8(runner, not_ascii1);
+        runner = _mm512_sub_epi8(runner, not_ascii2);
+        runner = _mm512_sub_epi8(runner, not_ascii3);
+        runner = _mm512_sub_epi8(runner, not_ascii4);
       }
-      for (; k < forward; k++) {
-        uint32_t word = buf[k];
-        if ((word & 0xFFFFFF80) == 0) {
-          *utf8_output++ = char(word);
-        } else if ((word & 0xFFFFF800) == 0) {
-          *utf8_output++ = char((word >> 6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if ((word & 0xFFFF0000) == 0) {
-          if (word >= 0xD800 && word <= 0xDFFF) {
-            return std::make_pair(
-                result(error_code::SURROGATE, buf - start + k),
-                reinterpret_cast<char *>(utf8_output));
-          }
-          *utf8_output++ = char((word >> 12) | 0b11100000);
-          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else {
-          if (word > 0x10FFFF) {
-            return std::make_pair(
-                result(error_code::TOO_LARGE, buf - start + k),
-                reinterpret_cast<char *>(utf8_output));
-          }
-          *utf8_output++ = char((word >> 18) | 0b11110000);
-          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
+      // Remaining chunks of this batch, one at a time.
+      for (; i <= max_i; i += sizeof(__m512i)) {
+        __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));
+
+        __mmask64 mask =
+            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input);
+        __m512i not_ascii =
+            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF);
+        runner = _mm512_sub_epi8(runner, not_ascii);
       }
-      buf += k;
+      // Sum the one-byte counters into the eight 64-bit running sums.
+      eight_64bits = _mm512_add_epi64(
+          eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512()));
+    }
+    // Fold the eight 64-bit sums into the answer.
+    __m256i first_half = _mm512_extracti64x4_epi64(eight_64bits, 0);
+    __m256i second_half = _mm512_extracti64x4_epi64(eight_64bits, 1);
+    answer += (size_t)_mm256_extract_epi64(first_half, 0) +
+              (size_t)_mm256_extract_epi64(first_half, 1) +
+              (size_t)_mm256_extract_epi64(first_half, 2) +
+              (size_t)_mm256_extract_epi64(first_half, 3) +
+              (size_t)_mm256_extract_epi64(second_half, 0) +
+              (size_t)_mm256_extract_epi64(second_half, 1) +
+              (size_t)_mm256_extract_epi64(second_half, 2) +
+              (size_t)_mm256_extract_epi64(second_half, 3);
+  } else if (answer > 0) { // shorter inputs with at least one whole chunk
+    for (; i + sizeof(__m512i) <= length; i += sizeof(__m512i)) {
+      __m512i latin = _mm512_loadu_si512((const __m512i *)(str + i));
+      uint64_t non_ascii = _mm512_movepi8_mask(latin); // one bit per byte, taken from its top bit
+      answer += count_ones(non_ascii);
     }
-  } // while
+  }
+  return answer + scalar::latin1::utf8_length_from_latin1( // bytes past the last whole chunk
+                      reinterpret_cast<const char *>(str + i), length - i);
+}
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start),
-                        reinterpret_cast<char *>(utf8_output));
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+    const char *input, size_t length) const noexcept { // per 64-byte block: non-continuation bytes + bytes >= 0xF0, then scalar tail
+  size_t pos = 0;
+  size_t count = 0;
+  // This algorithm could no doubt be improved!
+  for (; pos + 64 <= length; pos += 64) {
+    __m512i utf8 = _mm512_loadu_si512((const __m512i *)(input + pos));
+    uint64_t utf8_continuation_mask =
+        _mm512_cmplt_epi8_mask(utf8, _mm512_set1_epi8(-65 + 1)); // signed < -64, i.e. bytes 0x80..0xBF
+    // Count one UTF-16 code unit for every byte that is not a continuation
+    // byte (ASCII and leading bytes).
+    count += 64 - count_ones(utf8_continuation_mask);
+    uint64_t utf8_4byte =
+        _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240))); // unsigned >= 0xF0
+    count += count_ones(utf8_4byte); // each such byte adds a second code unit
+  }
+  return count +
+         scalar::utf8::utf16_length_from_utf8(input + pos, length - pos);
 }
-/* end file src/arm64/arm_convert_utf32_to_utf8.cpp */
 
-} // unnamed namespace
-} // namespace arm64
-} // namespace simdutf
-/* begin file src/generic/buf_block_reader.h */
-namespace simdutf {
-namespace arm64 {
-namespace {
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept { // sums 1, 2, 3 or 4 bytes per 32-bit value, 16 values per step
+  const char32_t *ptr = input;
+  size_t count{0};
 
-// Walks through a buffer in block-sized increments, loading the last part with
-// spaces
-template <size_t STEP_SIZE> struct buf_block_reader {
-public:
-  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
-  simdutf_really_inline size_t block_index();
-  simdutf_really_inline bool has_full_block() const;
-  simdutf_really_inline const uint8_t *full_block() const;
-  /**
-   * Get the last block, padded with spaces.
-   *
-   * There will always be a last block, with at least 1 byte, unless len == 0
-   * (in which case this function fills the buffer with spaces and returns 0. In
-   * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder
-   * block with STEP_SIZE bytes and no spaces for padding.
-   *
-   * @return the number of effective characters in the last block.
-   */
-  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
-  simdutf_really_inline void advance();
+  if (length >= 16) {
+    const char32_t *end = input + length - 16; // last position with 16 values still available
 
-private:
-  const uint8_t *buf;
-  const size_t len;
-  const size_t lenminusstep;
-  size_t idx;
-};
+    const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f);
+    const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff);
+    const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
 
-// Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char *format_input_text_64(const uint8_t *text) {
-  static char *buf =
-      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
-    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+    while (ptr <= end) {
+      __m512i utf32 = _mm512_loadu_si512((const __m512i *)ptr);
+      ptr += 16;
+      __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f);
+      __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(
+          _knot_mask16(ascii_bitmask), utf32, v_0000_07ff); // <= 0x7ff but not ASCII
+      __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(
+          _knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32,
+          v_0000_ffff); // <= 0xffff and in neither earlier class
+      // Whatever is left (values above 0xffff) is counted as four bytes.
+      size_t ascii_count = count_ones(ascii_bitmask);
+      size_t two_bytes_count = count_ones(two_bytes_bitmask);
+      size_t three_bytes_count = count_ones(three_bytes_bitmask);
+      size_t four_bytes_count =
+          16 - ascii_count - two_bytes_count - three_bytes_count;
+      count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count +
+               4 * four_bytes_count;
+    }
   }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+  // Values left after the 16-value blocks go to the scalar routine.
+  return count +
+         scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input));
 }
 
-// Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char *format_input_text(const simd8x64<uint8_t> &in) {
-  static char *buf =
-      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  in.store(reinterpret_cast<uint8_t *>(buf));
-  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
-    if (buf[i] < ' ') {
-      buf[i] = '_';
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept { // one unit per 32-bit value, plus one for each value above 0xffff
+  const char32_t *ptr = input;
+  size_t count{0};
+
+  if (length >= 16) {
+    const char32_t *end = input + length - 16; // last position with 16 values still available
+
+    const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
+
+    while (ptr <= end) {
+      __m512i utf32 = _mm512_loadu_si512((const __m512i *)ptr);
+      ptr += 16;
+      __mmask16 surrogates_bitmask =
+          _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
+      // 16 units for the block, plus one extra per value above 0xffff.
+      count += 16 + count_ones(surrogates_bitmask);
     }
   }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
+  // Values left after the 16-value blocks go to the scalar routine.
+  return count +
+         scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input));
 }
 
-simdutf_unused static char *format_mask(uint64_t mask) {
-  static char *buf = reinterpret_cast<char *>(malloc(64 + 1));
-  for (size_t i = 0; i < 64; i++) {
-    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
-  }
-  buf[64] = '\0';
-  return buf;
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  return implementation::count_utf8(input, length); // the UTF-32 length is whatever count_utf8 reports
 }
 
-template <size_t STEP_SIZE>
-simdutf_really_inline
-buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
-    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
-      idx{0} {}
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char *input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
+}
 
-template <size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
-  return idx;
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  return (options & base64_url)
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options,
+                                             last_chunk_options);
 }
 
-template <size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
-  return idx < lenminusstep;
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  return (options & base64_url)
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options,
+                                             last_chunk_options);
 }
 
-template <size_t STEP_SIZE>
-simdutf_really_inline const uint8_t *
-buf_block_reader<STEP_SIZE>::full_block() const {
-  return &buf[idx];
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char16_t *input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length);
 }
 
-template <size_t STEP_SIZE>
-simdutf_really_inline size_t
-buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
-  if (len == idx) {
-    return 0;
-  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-  std::memset(dst, 0x20,
-              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
-                          // to write out 8 or 16 bytes at once.
-  std::memcpy(dst, buf + idx, len - idx);
-  return len - idx;
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  return (options & base64_url)
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options,
+                                             last_chunk_options);
 }
 
-template <size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
-  idx += STEP_SIZE;
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  return (options & base64_url)
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options,
+                                             last_chunk_options);
 }
 
-} // unnamed namespace
-} // namespace arm64
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+    size_t length, base64_options options) const noexcept {
+  return scalar::base64::base64_length_from_binary(length, options);
+}
+
+size_t implementation::binary_to_base64(const char *input, size_t length,
+                                        char *output,
+                                        base64_options options) const noexcept {
+  if (options & base64_url) {
+    return encode_base64<true>(output, input, length, options);
+  } else {
+    return encode_base64<false>(output, input, length, options);
+  }
+}
+
+} // namespace icelake
 } // namespace simdutf
-/* end file src/generic/buf_block_reader.h */
-/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
-namespace simdutf {
-namespace arm64 {
-namespace {
-namespace utf8_validation {
 
-using namespace simd;
+/* begin file src/simdutf/icelake/end.h */
+#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
+// nothing needed.
+#else
+SIMDUTF_UNTARGET_REGION
+#endif
 
-simdutf_really_inline simd8<uint8_t>
-check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-  // Bit 1 = Too Long (ASCII followed by continuation)
-  // Bit 2 = Overlong 3-byte
-  // Bit 4 = Surrogate
-  // Bit 5 = Overlong 2-byte
-  // Bit 7 = Two Continuations
-  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
-                                               // 11______ 11______
-  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
-  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
-  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
-  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
-                                               // 11110100 101_____
-                                               // 11110101 1001____
-                                               // 11110101 101_____
-                                               // 1111011_ 1001____
-                                               // 1111011_ 101_____
-                                               // 11111___ 1001____
-                                               // 11111___ 101_____
-  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-  // 11110101 1000____
-  // 1111011_ 1000____
-  // 11111___ 1000____
-  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
-  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
-  constexpr const uint8_t CARRY =
-      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
-  const simd8<uint8_t> byte_1_low =
-      (prev1 & 0x0F)
-          .lookup_16<uint8_t>(
-              // ____0000 ________
-              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-              // ____0001 ________
-              CARRY | OVERLONG_2,
-              // ____001_ ________
-              CARRY, CARRY,
+#if SIMDUTF_GCC11ORMORE // workaround for
+                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
+SIMDUTF_POP_DISABLE_WARNINGS
+#endif // end of workaround
+/* end file src/simdutf/icelake/end.h */
+/* end file src/icelake/implementation.cpp */
+#endif
+#if SIMDUTF_IMPLEMENTATION_HASWELL
+/* begin file src/haswell/implementation.cpp */
 
-              // ____0100 ________
-              CARRY | TOO_LARGE,
-              // ____0101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              // ____011_ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
+/* begin file src/simdutf/haswell/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "haswell"
+// #define SIMDUTF_IMPLEMENTATION haswell
 
-              // ____1___ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              // ____1101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000);
-  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT,
+#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
+// nothing needed.
+#else
+SIMDUTF_TARGET_HASWELL
+#endif
 
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
-          OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+#if SIMDUTF_GCC11ORMORE // workaround for
+                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
+// clang-format off
+SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
+// clang-format on
+#endif // end of workaround
+/* end file src/simdutf/haswell/begin.h */
+namespace simdutf {
+namespace haswell {
+namespace {
+#ifndef SIMDUTF_HASWELL_H
+  #error "haswell.h must be included"
+#endif
+using namespace simd;
 
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
-  return (byte_1_high & byte_1_low & byte_2_high);
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
+  return input.reduce_or().is_ascii();
 }
-simdutf_really_inline simd8<uint8_t>
-check_multibyte_lengths(const simd8<uint8_t> input,
-                        const simd8<uint8_t> prev_input,
-                        const simd8<uint8_t> sc) {
-  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
-  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
-  simd8<uint8_t> must23 =
-      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
-  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
-  return must23_80 ^ sc;
+
+simdutf_unused simdutf_really_inline simd8<bool>
+must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
+                     const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_second_byte =
+      prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
+  simd8<uint8_t> is_third_byte =
+      prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+  simd8<uint8_t> is_fourth_byte =
+      prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+  // Caller requires a bool (all 1's). All values resulting from the subtraction
+  // will be <= 64, so signed comparison is fine.
+  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) >
+         int8_t(0);
 }
 
-//
-// Return nonzero if there are incomplete multibyte characters at the end of the
-// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
-//
-simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
-  // If the previous input's last 3 bytes match this, they're too short (they
-  // ended at EOF):
-  // ... 1111____ 111_____ 11______
-  static const uint8_t max_array[32] = {255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        0b11110000u - 1,
-                                        0b11100000u - 1,
-                                        0b11000000u - 1};
-  const simd8<uint8_t> max_value(
-      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
-  return input.gt_bits(max_value);
+simdutf_really_inline simd8<bool>
+must_be_2_3_continuation(const simd8<uint8_t> prev2,
+                         const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_third_byte =
+      prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be > 0x80
+  simd8<uint8_t> is_fourth_byte =
+      prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be > 0x80
+  return simd8<bool>(is_third_byte | is_fourth_byte);
 }
 
-struct utf8_checker {
-  // If this is nonzero, there has been a UTF-8 error.
-  simd8<uint8_t> error;
-  // The last input we received
-  simd8<uint8_t> prev_input_block;
-  // Whether the last input we received was incomplete (used for ASCII fast
-  // path)
-  simd8<uint8_t> prev_incomplete;
+/* begin file src/haswell/avx2_validate_utf16.cpp */
+/*
+    In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning.
+
+    In a vectorized algorithm we want to examine the most significant
+    nibble in order to select a fast path. If none of the highest nibbles
+    are 0xD (13), then we are sure that the UTF-16 chunk in a vector
+    register is valid.
+
+    Let us analyze what we need to check if the nibble is 0xD. The
+    value of the preceding nibble determines what we have:
+
+    0xd000 .. 0xd7ff - a valid word
+    0xd800 .. 0xdbff - low surrogate (Unicode: "high"/leading surrogate)
+    0xdc00 .. 0xdfff - high surrogate (Unicode: "low"/trailing surrogate)
+
+    Other constraints we have to consider:
+    - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
+    - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
+    - there must not be sole low surrogate nor high surrogate
+
+    We're going to build three bitmasks based on the 3rd nibble:
+    - V = valid word,
+    - L = low surrogate (0xd800 .. 0xdbff)
+    - H = high surrogate (0xdc00 .. 0xdfff)
+
+      0   1   2   3   4   5   6   7    <--- word index
+    [ V | L | H | L | H | V | V | L ]
+      1   0   0   0   0   1   1   0     - V = valid masks
+      0   1   0   1   0   0   0   1     - L = low surrogate
+      0   0   1   0   1   0   0   0     - H high surrogate
+
+
+      1   0   0   0   0   1   1   0   V = valid masks
+      0   1   0   1   0   0   0   0   a = L & (H >> 1)
+      0   0   1   0   1   0   0   0   b = a << 1
+      1   1   1   1   1   1   1   0   c = V | a | b
+                                  ^
+                                  the last bit can be zero: we just consume 7
+   code units (all but the last) and recheck this word in the next iteration
+*/
+
+/* Returns:
+   - pointer to the first unprocessed code unit (a scalar fallback should
+   check the rest);
+   - nullptr if an error was detected.
+*/
+template <endianness big_endian>
+const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) {
+  const char16_t *end = input + size;
+
+  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+  const auto v_fc = simd8<uint8_t>::splat(0xfc);
+  const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+  while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
+    // 0. Load data: since the validation takes into account only higher
+    //    byte of each word, we compress the two vectors into one which
+    //    consists only of the higher bytes.
+    auto in0 = simd16<uint16_t>(input);
+    auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
+
+    if (big_endian) {
+      in0 = in0.swap_bytes();
+      in1 = in1.swap_bytes();
+    }
+
+    const auto t0 = in0.shr<8>();
+    const auto t1 = in1.shr<8>();
+
+    const auto in = simd16<uint16_t>::pack(t0, t1);
+
+    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+    const auto surrogates_wordmask = (in & v_f8) == v_d8;
+    const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
+    if (surrogates_bitmask == 0x0) {
+      input += simd16<uint16_t>::ELEMENTS * 2;
+    } else {
+      // 2. We have some surrogates that have to be distinguished:
+      //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+      //
+      //    Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+      // V - non-surrogate code units
+      //     V = not surrogates_wordmask
+      const uint32_t V = ~surrogates_bitmask;
 
-  //
-  // Check whether the current bytes are valid UTF-8.
-  //
-  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
-                                              const simd8<uint8_t> prev_input) {
-    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
-    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
-    // small negative numbers)
-    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-    simd8<uint8_t> sc = check_special_cases(input, prev1);
-    this->error |= check_multibyte_lengths(input, prev_input, sc);
-  }
+      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+      const auto vH = (in & v_fc) == v_dc;
+      const uint32_t H = vH.to_bitmask();
 
-  // The only problem that can happen at EOF is that a multibyte character is
-  // too short or a byte value too large in the last bytes: check_special_cases
-  // only checks for bytes too large in the first of two bytes.
-  simdutf_really_inline void check_eof() {
-    // If the previous block had incomplete UTF-8 characters at the end, an
-    // ASCII block can't possibly finish them.
-    this->error |= this->prev_incomplete;
-  }
+      // L - word mask for low surrogates
+      //     L = not H and surrogates_wordmask
+      const uint32_t L = ~H & surrogates_bitmask;
 
-  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
-    if (simdutf_likely(is_ascii(input))) {
-      this->error |= this->prev_incomplete;
-    } else {
-      // you might think that a for-loop would work, but under Visual Studio, it
-      // is not good enough.
-      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
-                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+      const uint32_t a =
+          L & (H >> 1); // A low surrogate must be followed by high one.
+                        // (A low surrogate placed in the register's last word
+                        // is an exception we handle.)
+      const uint32_t b =
+          a << 1; // Just mark that the opposite fact is hold,
+                  // thanks to that we have only two masks for valid case.
+      const uint32_t c = V | a | b; // Combine all the masks into the final one.
+
+      if (c == 0xffffffff) {
+        // The whole input register contains valid UTF-16, i.e.,
+        // either single code units or proper surrogate pairs.
+        input += simd16<uint16_t>::ELEMENTS * 2;
+      } else if (c == 0x7fffffff) {
+        // The 31 lower code units of the input register contain valid UTF-16.
+        // The last word may be either a low or high surrogate. In the next
+        // iteration we 1) check if the low surrogate is followed by a high
+        // one, 2) reject sole high surrogate.
+        input += simd16<uint16_t>::ELEMENTS * 2 - 1;
+      } else {
+        return nullptr;
       }
-      this->prev_incomplete =
-          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
-      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
     }
   }
 
-  // do not forget to call check_eof!
-  simdutf_really_inline bool errors() const {
-    return this->error.any_bits_set_anywhere();
+  return input;
+}
+
+template <endianness big_endian>
+const result avx2_validate_utf16_with_errors(const char16_t *input,
+                                             size_t size) {
+  if (simdutf_unlikely(size == 0)) {
+    return result(error_code::SUCCESS, 0);
   }
+  const char16_t *start = input;
+  const char16_t *end = input + size;
 
-}; // struct utf8_checker
-} // namespace utf8_validation
+  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+  const auto v_fc = simd8<uint8_t>::splat(0xfc);
+  const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
-using utf8_validation::utf8_checker;
+  while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
+    // 0. Load data: since the validation takes into account only higher
+    //    byte of each word, we compress the two vectors into one which
+    //    consists only of the higher bytes.
+    auto in0 = simd16<uint16_t>(input);
+    auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
 
-} // unnamed namespace
-} // namespace arm64
-} // namespace simdutf
-/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
-/* begin file src/generic/utf8_validation/utf8_validator.h */
-namespace simdutf {
-namespace arm64 {
-namespace {
-namespace utf8_validation {
+    if (big_endian) {
+      in0 = in0.swap_bytes();
+      in1 = in1.swap_bytes();
+    }
 
-/**
- * Validates that the string is actual UTF-8.
- */
-template <class checker>
-bool generic_validate_utf8(const uint8_t *input, size_t length) {
-  checker c{};
-  buf_block_reader<64> reader(input, length);
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
-    c.check_next_input(in);
-    reader.advance();
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  c.check_next_input(in);
-  reader.advance();
-  c.check_eof();
-  return !c.errors();
-}
+    const auto t0 = in0.shr<8>();
+    const auto t1 = in1.shr<8>();
 
-bool generic_validate_utf8(const char *input, size_t length) {
-  return generic_validate_utf8<utf8_checker>(
-      reinterpret_cast<const uint8_t *>(input), length);
-}
+    const auto in = simd16<uint16_t>::pack(t0, t1);
 
-/**
- * Validates that the string is actual UTF-8 and stops on errors.
- */
-template <class checker>
-result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
-  checker c{};
-  buf_block_reader<64> reader(input, length);
-  size_t count{0};
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
-    c.check_next_input(in);
-    if (c.errors()) {
-      if (count != 0) {
-        count--;
-      } // Sometimes the error is only detected in the next chunk
-      result res = scalar::utf8::rewind_and_validate_with_errors(
-          reinterpret_cast<const char *>(input),
-          reinterpret_cast<const char *>(input + count), length - count);
-      res.count += count;
-      return res;
+    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+    const auto surrogates_wordmask = (in & v_f8) == v_d8;
+    const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
+    if (surrogates_bitmask == 0x0) {
+      input += simd16<uint16_t>::ELEMENTS * 2;
+    } else {
+      // 2. We have some surrogates that have to be distinguished:
+      //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+      //
+      //    Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+      // V - non-surrogate code units
+      //     V = not surrogates_wordmask
+      const uint32_t V = ~surrogates_bitmask;
+
+      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+      const auto vH = (in & v_fc) == v_dc;
+      const uint32_t H = vH.to_bitmask();
+
+      // L - word mask for low surrogates
+      //     L = not H and surrogates_wordmask
+      const uint32_t L = ~H & surrogates_bitmask;
+
+      const uint32_t a =
+          L & (H >> 1); // A low surrogate must be followed by high one.
+                        // (A low surrogate placed in the register's last word
+                        // is an exception we handle.)
+      const uint32_t b =
+          a << 1; // Just mark that the opposite fact is hold,
+                  // thanks to that we have only two masks for valid case.
+      const uint32_t c = V | a | b; // Combine all the masks into the final one.
+
+      if (c == 0xffffffff) {
+        // The whole input register contains valid UTF-16, i.e.,
+        // either single code units or proper surrogate pairs.
+        input += simd16<uint16_t>::ELEMENTS * 2;
+      } else if (c == 0x7fffffff) {
+        // The 31 lower code units of the input register contain valid UTF-16.
+        // The last word may be either a low or high surrogate. In the next
+        // iteration we 1) check if the low surrogate is followed by a high
+        // one, 2) reject sole high surrogate.
+        input += simd16<uint16_t>::ELEMENTS * 2 - 1;
+      } else {
+        return result(error_code::SURROGATE, input - start);
+      }
     }
-    reader.advance();
-    count += 64;
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  c.check_next_input(in);
-  reader.advance();
-  c.check_eof();
-  if (c.errors()) {
-    if (count != 0) {
-      count--;
-    } // Sometimes the error is only detected in the next chunk
-    result res = scalar::utf8::rewind_and_validate_with_errors(
-        reinterpret_cast<const char *>(input),
-        reinterpret_cast<const char *>(input) + count, length - count);
-    res.count += count;
-    return res;
-  } else {
-    return result(error_code::SUCCESS, length);
   }
-}
 
-result generic_validate_utf8_with_errors(const char *input, size_t length) {
-  return generic_validate_utf8_with_errors<utf8_checker>(
-      reinterpret_cast<const uint8_t *>(input), length);
+  return result(error_code::SUCCESS, input - start);
 }
+/* end file src/haswell/avx2_validate_utf16.cpp */
+/* begin file src/haswell/avx2_validate_utf32le.cpp */
+/* Returns:
+   - pointer to the first unprocessed character (a scalar fallback should
+   check the rest);
+   - nullptr if an error was detected.
+*/
+const char32_t *avx2_validate_utf32le(const char32_t *input, size_t size) {
+  const char32_t *end = input + size;
 
-template <class checker>
-bool generic_validate_ascii(const uint8_t *input, size_t length) {
-  buf_block_reader<64> reader(input, length);
-  uint8_t blocks[64]{};
-  simd::simd8x64<uint8_t> running_or(blocks);
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
-    running_or |= in;
-    reader.advance();
+  const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
+  const __m256i offset = _mm256_set1_epi32(0xffff2000);
+  const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
+  __m256i currentmax = _mm256_setzero_si256();
+  __m256i currentoffsetmax = _mm256_setzero_si256();
+
+  while (input + 8 < end) {
+    const __m256i in = _mm256_loadu_si256((__m256i *)input);
+    currentmax = _mm256_max_epu32(in, currentmax);
+    currentoffsetmax =
+        _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
+    input += 8;
+  }
+  __m256i is_zero =
+      _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
+  if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+    return nullptr;
   }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  running_or |= in;
-  return running_or.is_ascii();
-}
 
-bool generic_validate_ascii(const char *input, size_t length) {
-  return generic_validate_ascii<utf8_checker>(
-      reinterpret_cast<const uint8_t *>(input), length);
+  is_zero = _mm256_xor_si256(
+      _mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
+  if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+    return nullptr;
+  }
+
+  return input;
 }
 
-template <class checker>
-result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) {
-  buf_block_reader<64> reader(input, length);
-  size_t count{0};
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
-    if (!in.is_ascii()) {
-      result res = scalar::ascii::validate_with_errors(
-          reinterpret_cast<const char *>(input + count), length - count);
-      return result(res.error, count + res.count);
+const result avx2_validate_utf32le_with_errors(const char32_t *input,
+                                               size_t size) {
+  const char32_t *start = input;
+  const char32_t *end = input + size;
+
+  const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
+  const __m256i offset = _mm256_set1_epi32(0xffff2000);
+  const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
+  __m256i currentmax = _mm256_setzero_si256();
+  __m256i currentoffsetmax = _mm256_setzero_si256();
+
+  while (input + 8 < end) {
+    const __m256i in = _mm256_loadu_si256((__m256i *)input);
+    currentmax = _mm256_max_epu32(in, currentmax);
+    currentoffsetmax =
+        _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
+
+    __m256i is_zero = _mm256_xor_si256(
+        _mm256_max_epu32(currentmax, standardmax), standardmax);
+    if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+      return result(error_code::TOO_LARGE, input - start);
     }
-    reader.advance();
 
-    count += 64;
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  if (!in.is_ascii()) {
-    result res = scalar::ascii::validate_with_errors(
-        reinterpret_cast<const char *>(input + count), length - count);
-    return result(res.error, count + res.count);
-  } else {
-    return result(error_code::SUCCESS, length);
+    is_zero =
+        _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax),
+                         standardoffsetmax);
+    if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+      return result(error_code::SURROGATE, input - start);
+    }
+    input += 8;
   }
+
+  return result(error_code::SUCCESS, input - start);
 }
+/* end file src/haswell/avx2_validate_utf32le.cpp */
 
-result generic_validate_ascii_with_errors(const char *input, size_t length) {
-  return generic_validate_ascii_with_errors<utf8_checker>(
-      reinterpret_cast<const uint8_t *>(input), length);
+/* begin file src/haswell/avx2_convert_latin1_to_utf8.cpp */
+std::pair<const char *, char *>
+avx2_convert_latin1_to_utf8(const char *latin1_input, size_t len,
+                            char *utf8_output) {
+  const char *end = latin1_input + len;
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+  const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+  const size_t safety_margin = 12;
+
+  while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) {
+    __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input);
+    // each Latin-1 byte yields either 1 or 2 UTF-8 bytes
+    const __m128i v_80 = _mm_set1_epi8((char)0x80);
+    if (_mm_testz_si128(in8, v_80)) { // ASCII fast path!!!!
+      // 1. store (16 bytes)
+      _mm_storeu_si128((__m128i *)utf8_output, in8);
+      // 2. adjust pointers
+      latin1_input += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
+    }
+    // We proceed only with the first 16 bytes.
+    const __m256i in = _mm256_cvtepu8_epi16((in8));
+
+    // 1. prepare 2-byte values
+    // input 16-bit word : [0000|0000|aabb|bbbb] x 16
+    // expected output   : [1100|00aa|10bb|bbbb] x 16
+    const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+    const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+
+    // t0 = [0000|00aa|bbbb|bb00]
+    const __m256i t0 = _mm256_slli_epi16(in, 2);
+    // t1 = [0000|00aa|0000|0000]
+    const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+    // t2 = [0000|0000|00bb|bbbb]
+    const __m256i t2 = _mm256_and_si256(in, v_003f);
+    // t3 = [0000|00aa|00bb|bbbb]
+    const __m256i t3 = _mm256_or_si256(t1, t2);
+    // t4 = [1100|00aa|10bb|bbbb]
+    const __m256i t4 = _mm256_or_si256(t3, v_c080);
+
+    // 2. merge ASCII and 2-byte codewords
+
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+    const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+    // 3. prepare bitmask for 8-bit lookup
+    const uint32_t M0 = one_byte_bitmask & 0x55555555;
+    const uint32_t M1 = M0 >> 7;
+    const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+    // 4. pack the bytes
+
+    const uint8_t *row =
+        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+    const uint8_t *row_2 =
+        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)]
+                                                            [0];
+
+    const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+    const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+
+    const __m256i utf8_packed = _mm256_shuffle_epi8(
+        utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+    // 5. store bytes
+    _mm_storeu_si128((__m128i *)utf8_output,
+                     _mm256_castsi256_si128(utf8_packed));
+    utf8_output += row[0];
+    _mm_storeu_si128((__m128i *)utf8_output,
+                     _mm256_extractf128_si256(utf8_packed, 1));
+    utf8_output += row_2[0];
+
+    // 6. adjust pointers
+    latin1_input += 16;
+    continue;
+
+  } // while
+  return std::make_pair(latin1_input, utf8_output);
 }
+/* end file src/haswell/avx2_convert_latin1_to_utf8.cpp */
+/* begin file src/haswell/avx2_convert_latin1_to_utf16.cpp */
+template <endianness big_endian>
+std::pair<const char *, char16_t *>
+avx2_convert_latin1_to_utf16(const char *latin1_input, size_t len,
+                             char16_t *utf16_output) {
+  size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
 
-} // namespace utf8_validation
-} // unnamed namespace
-} // namespace arm64
-} // namespace simdutf
-/* end file src/generic/utf8_validation/utf8_validator.h */
-// transcoding from UTF-8 to UTF-16
-/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
+  size_t i = 0;
+  for (; i < rounded_len; i += 16) {
+    // Load 16 bytes from the address (input + i) into a xmm register
+    __m128i xmm0 =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(latin1_input + i));
 
-namespace simdutf {
-namespace arm64 {
-namespace {
-namespace utf8_to_utf16 {
-using namespace simd;
+    // Zero extend each byte in xmm0 to word and put it in another xmm register
+    __m128i xmm1 = _mm_cvtepu8_epi16(xmm0);
 
-simdutf_really_inline simd8<uint8_t>
-check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-  // Bit 1 = Too Long (ASCII followed by continuation)
-  // Bit 2 = Overlong 3-byte
-  // Bit 4 = Surrogate
-  // Bit 5 = Overlong 2-byte
-  // Bit 7 = Two Continuations
-  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
-                                               // 11______ 11______
-  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
-  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
-  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
-  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
-                                               // 11110100 101_____
-                                               // 11110101 1001____
-                                               // 11110101 101_____
-                                               // 1111011_ 1001____
-                                               // 1111011_ 101_____
-                                               // 11111___ 1001____
-                                               // 11111___ 101_____
-  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-  // 11110101 1000____
-  // 1111011_ 1000____
-  // 11111___ 1000____
-  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    // Shift xmm0 to the right by 8 bytes
+    xmm0 = _mm_srli_si128(xmm0, 8);
 
-  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
-  constexpr const uint8_t CARRY =
-      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
-  const simd8<uint8_t> byte_1_low =
-      (prev1 & 0x0F)
-          .lookup_16<uint8_t>(
-              // ____0000 ________
-              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-              // ____0001 ________
-              CARRY | OVERLONG_2,
-              // ____001_ ________
-              CARRY, CARRY,
+    // Zero extend each byte in the shifted xmm0 to word in xmm0
+    xmm0 = _mm_cvtepu8_epi16(xmm0);
 
-              // ____0100 ________
-              CARRY | TOO_LARGE,
-              // ____0101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              // ____011_ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
+    if (big_endian) {
+      const __m128i swap =
+          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      xmm0 = _mm_shuffle_epi8(xmm0, swap);
+      xmm1 = _mm_shuffle_epi8(xmm1, swap);
+    }
 
-              // ____1___ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              // ____1101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000);
-  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT,
+    // Store the contents of xmm1 into the address pointed by (output + i)
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i), xmm1);
 
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
-          OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+    // Store the contents of xmm0 into the address pointed by (output + i + 8)
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i + 8), xmm0);
+  }
 
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
-  return (byte_1_high & byte_1_low & byte_2_high);
+  return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len);
 }
-simdutf_really_inline simd8<uint8_t>
-check_multibyte_lengths(const simd8<uint8_t> input,
-                        const simd8<uint8_t> prev_input,
-                        const simd8<uint8_t> sc) {
-  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
-  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
-  simd8<uint8_t> must23 =
-      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
-  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
-  return must23_80 ^ sc;
+/* end file src/haswell/avx2_convert_latin1_to_utf16.cpp */
+/* begin file src/haswell/avx2_convert_latin1_to_utf32.cpp */
+std::pair<const char *, char32_t *>
+avx2_convert_latin1_to_utf32(const char *buf, size_t len,
+                             char32_t *utf32_output) {
+  size_t rounded_len = ((len | 7) ^ 7); // Round down to nearest multiple of 8
+
+  for (size_t i = 0; i < rounded_len; i += 8) {
+    // Load 8 Latin1 characters into the low 64 bits of a 128-bit register
+    __m128i in = _mm_loadl_epi64((__m128i *)&buf[i]);
+
+    // Zero extend the 8 Latin1 characters to 8 32-bit integers using
+    // vpmovzxbd
+    __m256i out = _mm256_cvtepu8_epi32(in);
+
+    // Store the results back to memory
+    _mm256_storeu_si256((__m256i *)&utf32_output[i], out);
+  }
+
+  // return pointers pointing to where we left off
+  return std::make_pair(buf + rounded_len, utf32_output + rounded_len);
 }
+/* end file src/haswell/avx2_convert_latin1_to_utf32.cpp */
 
-struct validating_transcoder {
-  // If this is nonzero, there has been a UTF-8 error.
-  simd8<uint8_t> error;
+/* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */
+// depends on "tables/utf8_to_utf16_tables.h"
 
-  validating_transcoder() : error(uint8_t(0)) {}
+// Convert up to 12 bytes (16 on the 2-byte fast path) from utf8 to utf16
+// using a mask indicating the end of the code points. The table lookup only
+// uses the least significant 12 bits of the mask.
+// It returns how many bytes were consumed (12 at most, or 16 on that path).
+template <endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char *input,
+                                    uint64_t utf8_end_of_code_point_mask,
+                                    char16_t *&utf16_output) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
   //
-  // Check whether the current bytes are valid UTF-8.
   //
-  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
-                                              const simd8<uint8_t> prev_input) {
-    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
-    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
-    // small negative numbers)
-    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-    simd8<uint8_t> sc = check_special_cases(input, prev1);
-    this->error |= check_multibyte_lengths(input, prev_input, sc);
+  // Optimization note: our main path below is load-latency dependent. Thus it
+  // is maybe beneficial to have fast paths that depend on branch prediction but
+  // have less latency. This results in more instructions but, potentially, also
+  // higher speeds.
+  //
+  // We first try a few fast paths.
+  const __m128i swap =
+      _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+  const __m128i in = _mm_loadu_si128((__m128i *)input);
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff;
+  if (utf8_end_of_code_point_mask == 0xfff) {
+    // We process the data in chunks of 12 bytes.
+    __m256i ascii = _mm256_cvtepu8_epi16(in);
+    if (big_endian) {
+      const __m256i swap256 = _mm256_setr_epi8(
+          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      ascii = _mm256_shuffle_epi8(ascii, swap256);
+    }
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii);
+    utf16_output += 12; // We wrote 12 16-bit characters.
+    return 12;          // We consumed 12 bytes.
+  }
+  if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+    // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte
+    // UTF-16 code units. There is probably a more efficient sequence, but the
+    // following might do.
+    const __m128i sh =
+        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    if (big_endian)
+      composed = _mm_shuffle_epi8(composed, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed);
+    utf16_output += 8; // We wrote 16 bytes, 8 code points.
+    return 16;
+  }
+  if (input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
+    // UTF-16 code units. There is probably a more efficient sequence, but the
+    // following might do.
+    const __m128i sh =
+        _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+    if (big_endian)
+      composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+    utf16_output += 4;
+    return 12;
   }
 
-  template <endianness endian>
-  simdutf_really_inline size_t convert(const char *in, size_t size,
-                                       char16_t *utf16_output) {
-    size_t pos = 0;
-    char16_t *start{utf16_output};
-    // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
-    // last 16 bytes, and if the data is valid, then it is entirely safe because
-    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
-    // generally assume that you have valid UTF-8 input, so we are going to go
-    // back from the end counting 8 leading bytes, to give us a good margin.
-    size_t leading_byte = 0;
-    size_t margin = size;
-    for (; margin > 0 && leading_byte < 8; margin--) {
-      leading_byte += (int8_t(in[margin - 1]) > -65);
+  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][1];
+  if (idx < 64) {
+    // SIX (6) input code points
+    // this is a relatively easy scenario
+    // we process SIX (6) input code points. The max length in bytes of six
+    // code points spanning between 1 and 2 bytes each is 12 bytes. On
+    // processors where pdep/pext is fast, we might be able to use a small
+    // lookup table.
+    const __m128i sh = _mm_loadu_si128(
+        (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    if (big_endian)
+      composed = _mm_shuffle_epi8(composed, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed);
+    utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential
+                       // overflow of 4 bytes.
+  } else if (idx < 145) {
+    // FOUR (4) input code points
+    const __m128i sh = _mm_loadu_si128(
+        (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
+    if (big_endian)
+      composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
+    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
+    utf16_output += 4; // Here we overflow by 8 bytes.
+  } else if (idx < 209) {
+    // THREE (3) input code points
+    //////////////
+    // There might be garbage inputs where a leading byte masquerades as a
+    // four-byte leading byte (by being followed by 3 continuation bytes), but
+    // is smaller than 0xf0. This could trigger a buffer overflow if we only
+    // counted leading bytes of the form 0xf0 as generating surrogate pairs,
+    // without further UTF-8 validation. Thus we must be careful to ensure that
+    // only leading bytes at least as large as 0xf0 generate surrogate pairs. We
+    // do so at the cost of an extra mask.
+    /////////////
+    const __m128i sh = _mm_loadu_si128(
+        (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+    // correct for spurious high bit
+    const __m128i correct =
+        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+    // We deliberately carry the leading four bits in highbyte if they are
+    // present, we remove them later when computing hightenbits.
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+    // When we need to generate a surrogate pair (leading byte >= 0xF0), then
+    // the corresponding 32-bit value in 'composed' will be greater than
+    // (0xf0000000 >> 6), that is, > 0x3c00000. This can be used later to
+    // identify the location of the surrogate pairs.
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+    const __m128i composedminus =
+        _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
+    const __m128i lowtenbits =
+        _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
+    // Notice the 0x3ff mask:
+    const __m128i hightenbits =
+        _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
+    const __m128i lowtenbitsadd =
+        _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
+    const __m128i hightenbitsadd =
+        _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
+    const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
+    __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
+    uint32_t basic_buffer[4];
+    uint32_t basic_buffer_swap[4];
+    if (big_endian) {
+      _mm_storeu_si128((__m128i *)basic_buffer_swap,
+                       _mm_shuffle_epi8(composed, swap));
+      surrogates = _mm_shuffle_epi8(surrogates, swap);
     }
-    // If the input is long enough, then we have that margin-1 is the eight last
-    // leading byte.
-    const size_t safety_margin = size - margin + 1; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      if (input.is_ascii()) {
-        input.store_ascii_as_utf16<endian>(utf16_output);
-        utf16_output += 64;
-        pos += 64;
+    _mm_storeu_si128((__m128i *)basic_buffer, composed);
+    uint32_t surrogate_buffer[4];
+    _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
+    for (size_t i = 0; i < 3; i++) {
+      if (basic_buffer[i] > 0x3c00000) {
+        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
+        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+        utf16_output += 2;
       } else {
-        // you might think that a for-loop would work, but under Visual Studio,
-        // it is not good enough.
-        static_assert(
-            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
-                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        auto zero = simd8<uint8_t>{uint8_t(0)};
-        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-        }
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        if (utf8_continuation_mask & 1) {
-          return 0; // error
-        }
-        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-        // We process in blocks of up to 12 bytes except possibly
-        // for fast paths which may process up to 16 bytes. For the
-        // slow path to work, we should have at least 12 input bytes left.
-        size_t max_starting_point = (pos + 64) - 12;
-        // Next loop is going to run at least five times.
-        while (pos < max_starting_point) {
-          // Performance note: our ability to compute 'consumed' and
-          // then shift and recompute is critical. If there is a
-          // latency of, say, 4 cycles on getting 'consumed', then
-          // the inner loop might have a total latency of about 6 cycles.
-          // Yet we process between 6 to 12 inputs bytes, thus we get
-          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-          // for this section of the code. Hence, there is a limit
-          // to how much we can further increase this latency before
-          // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_utf16<endian>(
-              in + pos, utf8_end_of_code_point_mask, utf16_output);
-          pos += consumed;
-          utf8_end_of_code_point_mask >>= consumed;
-        }
-        // At this point there may remain between 0 and 12 bytes in the
-        // 64-byte block. These bytes will be processed again. So we have an
-        // 80% efficiency (in the worst case). In practice we expect an
-        // 85% to 90% efficiency.
+        utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i])
+                                     : uint16_t(basic_buffer[i]);
+        utf16_output++;
       }
     }
-    if (errors()) {
-      return 0;
+  } else {
+    // here we know that there is an error but we do not handle errors
+  }
+  return consumed;
+}
+/* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */
+/* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */
+// depends on "tables/utf8_to_utf16_tables.h"
+
+// Convert up to 12 bytes (16 on the 2-byte fast path) from utf8 to utf32
+// using a mask indicating the end of the code points. The table lookup only
+// uses the least significant 12 bits of the mask.
+// It returns how many bytes were consumed (12 at most, or 16 on that path).
+size_t convert_masked_utf8_to_utf32(const char *input,
+                                    uint64_t utf8_end_of_code_point_mask,
+                                    char32_t *&utf32_output) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it
+  // is maybe beneficial to have fast paths that depend on branch prediction but
+  // have less latency. This results in more instructions but, potentially, also
+  // higher speeds.
+  //
+  // We first try a few fast paths.
+  const __m128i in = _mm_loadu_si128((__m128i *)input);
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff;
+  if (utf8_end_of_code_point_mask == 0xfff) {
+    // We process the data in chunks of 12 bytes.
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
+                        _mm256_cvtepu8_epi32(in));
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8),
+                        _mm256_cvtepu8_epi32(_mm_srli_si128(in, 8)));
+    utf32_output += 12; // We wrote 12 32-bit characters.
+    return 12;          // We consumed 12 bytes.
+  }
+  if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+    // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte
+    // UTF-32 code units. There is probably a more efficient sequence, but the
+    // following might do.
+    const __m128i sh =
+        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    _mm256_storeu_si256((__m256i *)utf32_output,
+                        _mm256_cvtepu16_epi32(composed));
+    utf32_output += 8; // We wrote 32 bytes, 8 code points.
+    return 16;
+  }
+  if (input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
+    // UTF-32 code units. There is probably a more efficient sequence, but the
+    // following might do.
+    const __m128i sh =
+        _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    _mm_storeu_si128((__m128i *)utf32_output, composed);
+    utf32_output += 4;
+    return 12;
+  }
+  /// We do not have a fast path available, so we fall back.
+
+  const uint8_t idx =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+  if (idx < 64) {
+    // SIX (6) input code points
+    // this is a relatively easy scenario
+    // we process SIX (6) input code points. The max length in bytes of six
+    // code points spanning between 1 and 2 bytes each is 12 bytes. On
+    // processors where pdep/pext is fast, we might be able to use a small
+    // lookup table.
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+    _mm256_storeu_si256((__m256i *)utf32_output,
+                        _mm256_cvtepu16_epi32(composed));
+    utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
+    // overflow of 32 - 24 = 8 bytes.
+  } else if (idx < 145) {
+    // FOUR (4) input code points
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii =
+        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
+    const __m128i middlebyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    const __m128i highbyte =
+        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
+    _mm_storeu_si128((__m128i *)utf32_output, composed);
+    utf32_output += 4;
+  } else if (idx < 209) {
+    // THREE (3) input code points
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i perm = _mm_shuffle_epi8(in, sh);
+    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
+    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
+    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
+    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
+    // correct for spurious high bit
+    const __m128i correct =
+        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
+    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
+    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
+    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
+    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
+    const __m128i composed =
+        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
+                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
+    _mm_storeu_si128((__m128i *)utf32_output, composed);
+    utf32_output +=
+        3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
+  } else {
+    // here we know that there is an error but we do not handle errors
+  }
+  return consumed;
+}
+/* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */
+
+/* begin file src/haswell/avx2_convert_utf16_to_latin1.cpp */
+template <endianness big_endian> // passed to match_system() to decide on the byte swap
+std::pair<const char16_t *, char *>
+avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+                             char *latin1_output) {
+  const char16_t *end = buf + len; // one past the last input code unit
+  while (end - buf >= 16) { // whole 16-unit blocks only; a shorter tail is not converted here
+    // Load 16 UTF-16 code units (32 bytes) into one 256-bit AVX2 register
+    __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
+
+    if (!match_system(big_endian)) { // exchange the two bytes of every 16-bit lane
+      const __m256i swap = _mm256_setr_epi8(
+          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      in = _mm256_shuffle_epi8(in, swap);
     }
-    if (pos < size) {
-      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
-          in + pos, size - pos, utf16_output);
-      if (howmany == 0) {
-        return 0;
-      }
-      utf16_output += howmany;
+
+    __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
+    if (_mm256_testz_si256(in, high_byte_mask)) { // true when no unit has a bit set above bit 7
+      // Narrow each 128-bit half to 8 bytes and store 16 Latin-1 bytes in total
+      __m128i lo = _mm256_extractf128_si256(in, 0);
+      __m128i hi = _mm256_extractf128_si256(in, 1);
+      __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo);
+      __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi);
+      _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
+                       latin1_packed_lo);
+      _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8),
+                       latin1_packed_hi);
+      // Advance both pointers past the 16 units just converted
+      buf += 16;
+      latin1_output += 16;
+    } else { // some unit is > 0xFF: return nullptr as the input pointer
+      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
     }
-    return utf16_output - start;
-  }
+  } // while
+  return std::make_pair(buf, latin1_output); // first unconverted input unit, end of written output
+}
 
-  template <endianness endian>
-  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
-                                                   char16_t *utf16_output) {
-    size_t pos = 0;
-    char16_t *start{utf16_output};
-    // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
-    // last 16 bytes, and if the data is valid, then it is entirely safe because
-    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
-    // generally assume that you have valid UTF-8 input, so we are going to go
-    // back from the end counting 8 leading bytes, to give us a good margin.
-    size_t leading_byte = 0;
-    size_t margin = size;
-    for (; margin > 0 && leading_byte < 8; margin--) {
-      leading_byte += (int8_t(in[margin - 1]) > -65);
+template <endianness big_endian> // passed to match_system() to decide on the byte swap
+std::pair<result, char *>
+avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
+                                         char *latin1_output) {
+  const char16_t *start = buf; // kept so positions can be reported as offsets from the input start
+  const char16_t *end = buf + len;
+  while (end - buf >= 16) { // whole 16-unit blocks only; a shorter tail is not converted here
+    __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
+
+    if (!match_system(big_endian)) { // exchange the two bytes of every 16-bit lane
+      const __m256i swap = _mm256_setr_epi8(
+          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      in = _mm256_shuffle_epi8(in, swap);
     }
-    // If the input is long enough, then we have that margin-1 is the eight last
-    // leading byte.
-    const size_t safety_margin = size - margin + 1; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      if (input.is_ascii()) {
-        input.store_ascii_as_utf16<endian>(utf16_output);
-        utf16_output += 64;
-        pos += 64;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio,
-        // it is not good enough.
-        static_assert(
-            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
-                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        auto zero = simd8<uint8_t>{uint8_t(0)};
-        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-        }
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        if (errors() || (utf8_continuation_mask & 1)) {
-          // rewind_and_convert_with_errors will seek a potential error from
-          // in+pos onward, with the ability to go back up to pos bytes, and
-          // read size-pos bytes forward.
-          result res =
-              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
-                  pos, in + pos, size - pos, utf16_output);
-          res.count += pos;
-          return res;
-        }
-        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-        // We process in blocks of up to 12 bytes except possibly
-        // for fast paths which may process up to 16 bytes. For the
-        // slow path to work, we should have at least 12 input bytes left.
-        size_t max_starting_point = (pos + 64) - 12;
-        // Next loop is going to run at least five times.
-        while (pos < max_starting_point) {
-          // Performance note: our ability to compute 'consumed' and
-          // then shift and recompute is critical. If there is a
-          // latency of, say, 4 cycles on getting 'consumed', then
-          // the inner loop might have a total latency of about 6 cycles.
-          // Yet we process between 6 to 12 inputs bytes, thus we get
-          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-          // for this section of the code. Hence, there is a limit
-          // to how much we can further increase this latency before
-          // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_utf16<endian>(
-              in + pos, utf8_end_of_code_point_mask, utf16_output);
-          pos += consumed;
-          utf8_end_of_code_point_mask >>= consumed;
+
+    __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
+    if (_mm256_testz_si256(in, high_byte_mask)) { // true when no unit has a bit set above bit 7
+      __m128i lo = _mm256_extractf128_si256(in, 0);
+      __m128i hi = _mm256_extractf128_si256(in, 1);
+      __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo); // low 8 bytes hold the narrowed units
+      __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi);
+      _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
+                       latin1_packed_lo);
+      _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8),
+                       latin1_packed_hi);
+      buf += 16;
+      latin1_output += 16;
+    } else {
+      // Some unit in this block is > 0xFF: rescan it one unit at a time to find it
+      for (int k = 0; k < 16; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if (word <= 0xff) {
+          *latin1_output++ = char(word); // units before the offending one are still written out
+        } else {
+          return std::make_pair(
+              result{error_code::TOO_LARGE, (size_t)(buf - start + k)}, // index of the offending unit
+              latin1_output);
         }
-        // At this point there may remain between 0 and 12 bytes in the
-        // 64-byte block. These bytes will be processed again. So we have an
-        // 80% efficiency (in the worst case). In practice we expect an
-        // 85% to 90% efficiency.
-      }
-    }
-    if (errors()) {
-      // rewind_and_convert_with_errors will seek a potential error from in+pos
-      // onward, with the ability to go back up to pos bytes, and read size-pos
-      // bytes forward.
-      result res =
-          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
-              pos, in + pos, size - pos, utf16_output);
-      res.count += pos;
-      return res;
-    }
-    if (pos < size) {
-      // rewind_and_convert_with_errors will seek a potential error from in+pos
-      // onward, with the ability to go back up to pos bytes, and read size-pos
-      // bytes forward.
-      result res =
-          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
-              pos, in + pos, size - pos, utf16_output);
-      if (res.error) { // In case of error, we want the error position
-        res.count += pos;
-        return res;
-      } else { // In case of success, we want the number of word written
-        utf16_output += res.count;
       }
+      buf += 16; // NOTE(review): presumably unreachable, the scan above should always return; confirm swap_bytes matches the vector swap
     }
-    return result(error_code::SUCCESS, utf16_output - start);
-  }
+  } // while
+  return std::make_pair(result{error_code::SUCCESS, (size_t)(buf - start)}, // count = input units consumed so far
+                        latin1_output);
+}
+/* end file src/haswell/avx2_convert_utf16_to_latin1.cpp */
+/* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */
+/*
+    The vectorized algorithm works on single SSE register i.e., it
+    loads eight 16-bit code units.
 
-  simdutf_really_inline bool errors() const {
-    return this->error.any_bits_set_anywhere();
-  }
+    We consider three cases:
+    1. an input register contains no surrogates and each value
+       is in range 0x0000 .. 0x07ff.
+    2. an input register contains no surrogates and values are
+       in range 0x0000 .. 0xffff.
+    3. an input register contains surrogates --- i.e. codepoints
+       can have 16 or 32 bits.
 
-}; // struct utf8_checker
-} // namespace utf8_to_utf16
-} // unnamed namespace
-} // namespace arm64
-} // namespace simdutf
-/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
-/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
+    Ad 1.
 
-namespace simdutf {
-namespace arm64 {
-namespace {
-namespace utf8_to_utf16 {
+    When values are less than 0x0800, it means that a 16-bit code unit
+    can be converted into: 1) single UTF8 byte (when it is an ASCII
+    char) or 2) two UTF8 bytes.
 
-using namespace simd;
+    For this case we do only some shuffle to obtain these 2-byte
+    codes and finally compress the whole SSE register with a single
+    shuffle.
 
-template <endianness endian>
-simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
-                                         char16_t *utf16_output) noexcept {
-  // The implementation is not specific to haswell and should be moved to the
-  // generic directory.
-  size_t pos = 0;
-  char16_t *start{utf16_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while (pos + 64 + safety_margin <= size) {
-    // this loop could be unrolled further. For example, we could process the
-    // mask far more than 64 bytes.
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if (in.is_ascii()) {
-      in.store_ascii_as_utf16<endian>(utf16_output);
-      utf16_output += 64;
-      pos += 64;
-    } else {
-      // Slow path. We hope that the compiler will recognize that this is a slow
-      // path. Anything that is not a continuation mask is a 'leading byte',
-      // that is, the start of a new code point.
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-      // -65 is 0b10111111 in two-complement's, so largest possible continuation
-      // byte
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      // The *start* of code points is not so useful, rather, we want the *end*
-      // of code points.
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-      // We process in blocks of up to 12 bytes except possibly
-      // for fast paths which may process up to 16 bytes. For the
-      // slow path to work, we should have at least 12 input bytes left.
-      size_t max_starting_point = (pos + 64) - 12;
-      // Next loop is going to run at least five times when using solely
-      // the slow/regular path, and at least four times if there are fast paths.
-      while (pos < max_starting_point) {
-        // Performance note: our ability to compute 'consumed' and
-        // then shift and recompute is critical. If there is a
-        // latency of, say, 4 cycles on getting 'consumed', then
-        // the inner loop might have a total latency of about 6 cycles.
-        // Yet we process between 6 to 12 inputs bytes, thus we get
-        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-        // for this section of the code. Hence, there is a limit
-        // to how much we can further increase this latency before
-        // it seriously harms performance.
-        //
-        // Thus we may allow convert_masked_utf8_to_utf16 to process
-        // more bytes at a time under a fast-path mode where 16 bytes
-        // are consumed at once (e.g., when encountering ASCII).
-        size_t consumed = convert_masked_utf8_to_utf16<endian>(
-            input + pos, utf8_end_of_code_point_mask, utf16_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
-      }
-      // At this point there may remain between 0 and 12 bytes in the
-      // 64-byte block. These bytes will be processed again. So we have an
-      // 80% efficiency (in the worst case). In practice we expect an
-      // 85% to 90% efficiency.
+    We need a 256-entry lookup table to get a compression pattern
+    and the number of output bytes in the compressed vector register.
+    Each entry occupies 17 bytes.
+
+    Ad 2.
+
+    When values fit in 16-bit code units, but are above 0x07ff, then
+    a single word may produce one, two or three UTF8 bytes.
+
+    We prepare data for all these three cases in two registers.
+    The first register contains lower two UTF8 bytes (used in all
+    cases), while the second one contains just the third byte for
+    the three-UTF8-bytes case.
+
+    Finally these two registers are interleaved forming eight-element
+    array of 32-bit values. The array spans two SSE registers.
+    The bytes from the registers are compressed using two shuffles.
+
+    We need a 256-entry lookup table to get a compression pattern
+    and the number of output bytes in the compressed vector register.
+    Each entry occupies 17 bytes.
+
+
+    To summarize:
+    - We need two 256-entry tables that have 8704 bytes in total.
+*/
+
+/*
+  Returns a pair: the first unprocessed byte from buf and utf8_output
+  A scalar routine should carry on the conversion of the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
+  const char16_t *end = buf + len;
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+    if (big_endian) {
+      const __m256i swap = _mm256_setr_epi8(
+          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      in = _mm256_shuffle_epi8(in, swap);
     }
-  }
-  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
-      input + pos, size - pos, utf16_output);
-  return utf16_output - start;
-}
+    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+    const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+    if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+      // 1. pack the bytes
+      const __m128i utf8_packed = _mm_packus_epi16(
+          _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
+      // 2. store (16 bytes)
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+      // 3. adjust pointers
+      buf += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
+    }
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
 
-} // namespace utf8_to_utf16
-} // unnamed namespace
-} // namespace arm64
-} // namespace simdutf
-/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
-// transcoding from UTF-8 to UTF-32
-/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
+    // no bits set above 11th bit
+    const __m256i one_or_two_bytes_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+    const uint32_t one_or_two_bytes_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+    if (one_or_two_bytes_bitmask == 0xffffffff) {
 
-namespace simdutf {
-namespace arm64 {
-namespace {
-namespace utf8_to_utf32 {
-using namespace simd;
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
 
-simdutf_really_inline simd8<uint8_t>
-check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-  // Bit 1 = Too Long (ASCII followed by continuation)
-  // Bit 2 = Overlong 3-byte
-  // Bit 4 = Surrogate
-  // Bit 5 = Overlong 2-byte
-  // Bit 7 = Two Continuations
-  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
-                                               // 11______ 11______
-  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
-  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
-  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
-  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
-                                               // 11110100 101_____
-                                               // 11110101 1001____
-                                               // 11110101 101_____
-                                               // 1111011_ 1001____
-                                               // 1111011_ 101_____
-                                               // 11111___ 1001____
-                                               // 11111___ 101_____
-  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-  // 11110101 1000____
-  // 1111011_ 1000____
-  // 11111___ 1000____
-  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const __m256i t0 = _mm256_slli_epi16(in, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const __m256i t2 = _mm256_and_si256(in, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const __m256i t3 = _mm256_or_si256(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const __m256i t4 = _mm256_or_si256(t3, v_c080);
 
-  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
-  constexpr const uint8_t CARRY =
-      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
-  const simd8<uint8_t> byte_1_low =
-      (prev1 & 0x0F)
-          .lookup_16<uint8_t>(
-              // ____0000 ________
-              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-              // ____0001 ________
-              CARRY | OVERLONG_2,
-              // ____001_ ________
-              CARRY, CARRY,
+      // 2. merge ASCII and 2-byte codewords
+      const __m256i utf8_unpacked =
+          _mm256_blendv_epi8(t4, in, one_byte_bytemask);
 
-              // ____0100 ________
-              CARRY | TOO_LARGE,
-              // ____0101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              // ____011_ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // 3. prepare bitmask for 8-bit lookup
+      const uint32_t M0 = one_byte_bitmask & 0x55555555;
+      const uint32_t M1 = M0 >> 7;
+      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+      // 4. pack the bytes
 
-              // ____1___ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              // ____1101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000);
-  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT,
+      const uint8_t *row =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+      const uint8_t *row_2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+                                                                       16)][0];
 
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
-          OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
 
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
-  return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t>
-check_multibyte_lengths(const simd8<uint8_t> input,
-                        const simd8<uint8_t> prev_input,
-                        const simd8<uint8_t> sc) {
-  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
-  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
-  simd8<uint8_t> must23 =
-      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
-  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
-  return must23_80 ^ sc;
-}
+      const __m256i utf8_packed = _mm256_shuffle_epi8(
+          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+      // 5. store bytes
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_castsi256_si128(utf8_packed));
+      utf8_output += row[0];
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_extractf128_si256(utf8_packed, 1));
+      utf8_output += row_2[0];
+
+      // 6. adjust pointers
+      buf += 16;
+      continue;
+    }
+    // 1. Check if there is any surrogate word in the input chunk.
+    //    We also have to deal with the situation where there is a surrogate
+    //    word at the end of a chunk.
+    const __m256i surrogates_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+    // bitmask = 0x0000 if there are no surrogates
+    //         = 0xc000 if the last word is a surrogate
+    const uint32_t surrogates_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+    // However, it is likely an uncommon occurrence.
+    if (surrogates_bitmask == 0x00000000) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+      const __m256i dup_even = _mm256_setr_epi16(
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+
+      /* In this branch we handle three cases:
+         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+        single UTF-8 byte
+         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
+        UTF-8 bytes
+         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+        three UTF-8 bytes
 
-struct validating_transcoder {
-  // If this is nonzero, there has been a UTF-8 error.
-  simd8<uint8_t> error;
+        We expand the input word (16-bit) into two code units (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
 
-  validating_transcoder() : error(uint8_t(0)) {}
-  //
-  // Check whether the current bytes are valid UTF-8.
-  //
-  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
-                                              const simd8<uint8_t> prev_input) {
-    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
-    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
-    // small negative numbers)
-    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-    simd8<uint8_t> sc = check_special_cases(input, prev1);
-    this->error |= check_multibyte_lengths(input, prev_input, sc);
-  }
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
 
-  simdutf_really_inline size_t convert(const char *in, size_t size,
-                                       char32_t *utf32_output) {
-    size_t pos = 0;
-    char32_t *start{utf32_output};
-    // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
-    // last 16 bytes, and if the data is valid, then it is entirely safe because
-    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
-    // generally assume that you have valid UTF-8 input, so we are going to go
-    // back from the end counting 16 leading bytes, to give us a good margin.
-    size_t leading_byte = 0;
-    size_t margin = size;
-    for (; margin > 0 && leading_byte < 8; margin--) {
-      leading_byte += (int8_t(in[margin - 1]) > -65);
-    }
-    // If the input is long enough, then we have that margin-1 is the fourth
-    // last leading byte.
-    const size_t safety_margin = size - margin + 1; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      if (input.is_ascii()) {
-        input.store_ascii_as_utf32(utf32_output);
-        utf32_output += 64;
-        pos += 64;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio,
-        // it is not good enough.
-        static_assert(
-            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
-                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        auto zero = simd8<uint8_t>{uint8_t(0)};
-        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-        }
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        if (utf8_continuation_mask & 1) {
-          return 0; // we have an error
-        }
-        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-        // We process in blocks of up to 12 bytes except possibly
-        // for fast paths which may process up to 16 bytes. For the
-        // slow path to work, we should have at least 12 input bytes left.
-        size_t max_starting_point = (pos + 64) - 12;
-        // Next loop is going to run at least five times.
-        while (pos < max_starting_point) {
-          // Performance note: our ability to compute 'consumed' and
-          // then shift and recompute is critical. If there is a
-          // latency of, say, 4 cycles on getting 'consumed', then
-          // the inner loop might have a total latency of about 6 cycles.
-          // Yet we process between 6 to 12 inputs bytes, thus we get
-          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-          // for this section of the code. Hence, there is a limit
-          // to how much we can further increase this latency before
-          // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_utf32(
-              in + pos, utf8_end_of_code_point_mask, utf32_output);
-          pos += consumed;
-          utf8_end_of_code_point_mask >>= consumed;
-        }
-        // At this point there may remain between 0 and 12 bytes in the
-        // 64-byte block. These bytes will be processed again. So we have an
-        // 80% efficiency (in the worst case). In practice we expect an
-        // 85% to 90% efficiency.
-      }
-    }
-    if (errors()) {
-      return 0;
-    }
-    if (pos < size) {
-      size_t howmany =
-          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-      if (howmany == 0) {
-        return 0;
-      }
-      utf32_output += howmany;
-    }
-    return utf32_output - start;
-  }
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
 
-  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
-                                                   char32_t *utf32_output) {
-    size_t pos = 0;
-    char32_t *start{utf32_output};
-    // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
-    // last 16 bytes, and if the data is valid, then it is entirely safe because
-    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
-    // generally assume that you have valid UTF-8 input, so we are going to go
-    // back from the end counting 8 leading bytes, to give us a good margin.
-    size_t leading_byte = 0;
-    size_t margin = size;
-    for (; margin > 0 && leading_byte < 8; margin--) {
-      leading_byte += (int8_t(in[margin - 1]) > -65);
-    }
-    // If the input is long enough, then we have that margin-1 is the fourth
-    // last leading byte.
-    const size_t safety_margin = size - margin + 1; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      if (input.is_ascii()) {
-        input.store_ascii_as_utf32(utf32_output);
-        utf32_output += 64;
-        pos += 64;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio,
-        // it is not good enough.
-        static_assert(
-            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
-                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        auto zero = simd8<uint8_t>{uint8_t(0)};
-        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-        }
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        if (errors() || (utf8_continuation_mask & 1)) {
-          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
-              pos, in + pos, size - pos, utf32_output);
-          res.count += pos;
-          return res;
-        }
-        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-        // We process in blocks of up to 12 bytes except possibly
-        // for fast paths which may process up to 16 bytes. For the
-        // slow path to work, we should have at least 12 input bytes left.
-        size_t max_starting_point = (pos + 64) - 12;
-        // Next loop is going to run at least five times.
-        while (pos < max_starting_point) {
-          // Performance note: our ability to compute 'consumed' and
-          // then shift and recompute is critical. If there is a
-          // latency of, say, 4 cycles on getting 'consumed', then
-          // the inner loop might have a total latency of about 6 cycles.
-          // Yet we process between 6 to 12 inputs bytes, thus we get
-          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-          // for this section of the code. Hence, there is a limit
-          // to how much we can further increase this latency before
-          // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_utf32(
-              in + pos, utf8_end_of_code_point_mask, utf32_output);
-          pos += consumed;
-          utf8_end_of_code_point_mask >>= consumed;
-        }
-        // At this point there may remain between 0 and 12 bytes in the
-        // 64-byte block. These bytes will be processed again. So we have an
-        // 80% efficiency (in the worst case). In practice we expect an
-        // 85% to 90% efficiency.
-      }
-    }
-    if (errors()) {
-      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
-          pos, in + pos, size - pos, utf32_output);
-      res.count += pos;
-      return res;
-    }
-    if (pos < size) {
-      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
-          pos, in + pos, size - pos, utf32_output);
-      if (res.error) { // In case of error, we want the error position
-        res.count += pos;
-        return res;
-      } else { // In case of success, we want the number of word written
-        utf32_output += res.count;
-      }
-    }
-    return result(error_code::SUCCESS, utf32_output - start);
-  }
+        Finally, from these two code units we build a proper UTF-8 sequence,
+        taking into account the case (i.e., the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
 
-  simdutf_really_inline bool errors() const {
-    return this->error.any_bits_set_anywhere();
-  }
+      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+      const __m256i s0 = _mm256_srli_epi16(in, 4);
+      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+                                             simdutf_vec(0b0100000000000000));
+      const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
 
-}; // struct utf8_checker
-} // namespace utf8_to_utf32
-} // unnamed namespace
-} // namespace arm64
-} // namespace simdutf
-/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
-/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
+      // 4. expand code units 16-bit => 32-bit
+      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
 
-namespace simdutf {
-namespace arm64 {
-namespace {
-namespace utf8_to_utf32 {
+      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+      // Due to the wider registers, the following path is less likely to be
+      // useful.
+      /*if(mask == 0) {
+        // We only have three-byte code units. Use fast path.
+        const __m256i shuffle =
+      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+      _mm256_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output,
+      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output,
+      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+        continue;
+      }*/
+      const uint8_t mask0 = uint8_t(mask);
+      const uint8_t *row0 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+      const __m128i utf8_0 =
+          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
 
-using namespace simd;
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+      const uint8_t *row1 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+      const __m128i utf8_1 =
+          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
 
-simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
-                                         char32_t *utf32_output) noexcept {
-  size_t pos = 0;
-  char32_t *start{utf32_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while (pos + 64 + safety_margin <= size) {
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if (in.is_ascii()) {
-      in.store_ascii_as_utf32(utf32_output);
-      utf32_output += 64;
-      pos += 64;
+      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+      const uint8_t *row2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+      const __m128i utf8_2 =
+          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+      const uint8_t *row3 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+      const __m128i utf8_3 =
+          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+      utf8_output += row0[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+      utf8_output += row1[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+      utf8_output += row2[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+      utf8_output += row3[0];
+      buf += 16;
+      // surrogate pair(s) in a register
     } else {
-      // -65 is 0b10111111 in two-complement's, so largest possible continuation
-      // byte
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-      size_t max_starting_point = (pos + 64) - 12;
-      while (pos < max_starting_point) {
-        size_t consumed = convert_masked_utf8_to_utf32(
-            input + pos, utf8_end_of_code_point_mask, utf32_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if ((word & 0xFF80) == 0) {
+          *utf8_output++ = char(word);
+        } else if ((word & 0xF800) == 0) {
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xF800) != 0xD800) {
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word =
+              big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) {
+            return std::make_pair(nullptr, utf8_output);
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value >> 18) | 0b11110000);
+          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+        }
       }
+      buf += k;
     }
-  }
-  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
-                                                       utf32_output);
-  return utf32_output - start;
+  } // while
+  return std::make_pair(buf, utf8_output);
 }
 
-} // namespace utf8_to_utf32
-} // unnamed namespace
-} // namespace arm64
-} // namespace simdutf
-/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
-// other functions
-/* begin file src/generic/utf16.h */
-namespace simdutf {
-namespace arm64 {
-namespace {
-namespace utf16 {
-
+/*
+  Returns a pair: a result struct and utf8_output.
+  If there is an error, the count field of the result is the position of the
+  error. Otherwise, it is the position of the first unprocessed byte in buf
+  (even if finished). A scalar routine should carry on the conversion of the
+  tail if needed.
+*/
 template <endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t *in,
-                                               size_t size) {
-  size_t pos = 0;
-  size_t count = 0;
-  for (; pos < size / 32 * 32; pos += 32) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    if (!match_system(big_endian)) {
-      input.swap_bytes();
-    }
-    uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-    count += count_ones(not_pair) / 2;
-  }
-  return count +
-         scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
-}
+std::pair<result, char *>
+avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
+                                       char *utf8_output) {
+  const char16_t *start = buf;
+  const char16_t *end = buf + len;
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
-                                                    size_t size) {
-  size_t pos = 0;
-  size_t count = 0;
-  // This algorithm could no doubt be improved!
-  for (; pos < size / 32 * 32; pos += 32) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    if (!match_system(big_endian)) {
-      input.swap_bytes();
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+    if (big_endian) {
+      const __m256i swap = _mm256_setr_epi8(
+          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      in = _mm256_shuffle_epi8(in, swap);
     }
-    uint64_t ascii_mask = input.lteq(0x7F);
-    uint64_t twobyte_mask = input.lteq(0x7FF);
-    uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
+    const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
+    if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
+      // 1. pack the bytes
+      const __m128i utf8_packed = _mm_packus_epi16(
+          _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
+      // 2. store (16 bytes)
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+      // 3. adjust pointers
+      buf += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
+    }
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
 
-    size_t ascii_count = count_ones(ascii_mask) / 2;
-    size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
-    size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
-    size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-    count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count +
-             ascii_count;
-  }
-  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
-                                                                   size - pos);
-}
+    // no bits set above 11th bit
+    const __m256i one_or_two_bytes_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
+    const uint32_t one_or_two_bytes_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+    if (one_or_two_bytes_bitmask == 0xffffffff) {
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
-                                                     size_t size) {
-  return count_code_points<big_endian>(in, size);
-}
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
 
-simdutf_really_inline void
-change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
-  size_t pos = 0;
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const __m256i t0 = _mm256_slli_epi16(in, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const __m256i t2 = _mm256_and_si256(in, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const __m256i t3 = _mm256_or_si256(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const __m256i t4 = _mm256_or_si256(t3, v_c080);
 
-  while (pos < size / 32 * 32) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    input.swap_bytes();
-    input.store(reinterpret_cast<uint16_t *>(output));
-    pos += 32;
-    output += 32;
-  }
+      // 2. merge ASCII and 2-byte codewords
+      const __m256i utf8_unpacked =
+          _mm256_blendv_epi8(t4, in, one_byte_bytemask);
+
+      // 3. prepare bitmask for 8-bit lookup
+      const uint32_t M0 = one_byte_bitmask & 0x55555555;
+      const uint32_t M1 = M0 >> 7;
+      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+      // 4. pack the bytes
 
-  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
-}
+      const uint8_t *row =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+      const uint8_t *row_2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+                                                                       16)][0];
 
-} // namespace utf16
-} // unnamed namespace
-} // namespace arm64
-} // namespace simdutf
-/* end file src/generic/utf16.h */
-/* begin file src/generic/utf8.h */
+      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
 
-namespace simdutf {
-namespace arm64 {
-namespace {
-namespace utf8 {
+      const __m256i utf8_packed = _mm256_shuffle_epi8(
+          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+      // 5. store bytes
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_castsi256_si128(utf8_packed));
+      utf8_output += row[0];
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_extractf128_si256(utf8_packed, 1));
+      utf8_output += row_2[0];
 
-using namespace simd;
+      // 6. adjust pointers
+      buf += 16;
+      continue;
+    }
+    // 1. Check if there are any surrogate words in the input chunk.
+    //    We also have to deal with the situation when there is a surrogate
+    //    word at the end of a chunk.
+    const __m256i surrogates_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
 
-simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
-  size_t pos = 0;
-  size_t count = 0;
-  for (; pos + 64 <= size; pos += 64) {
-    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-    uint64_t utf8_continuation_mask = input.gt(-65);
-    count += count_ones(utf8_continuation_mask);
-  }
-  return count + scalar::utf8::count_code_points(in + pos, size - pos);
-}
+    // bitmask = 0x00000000 if there are no surrogates
+    //         = 0xc0000000 if the last word is a surrogate
+    const uint32_t surrogates_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc0000000 could
+    // help. However, it is likely an uncommon occurrence.
+    if (surrogates_bitmask == 0x00000000) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+      const __m256i dup_even = _mm256_setr_epi16(
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
 
-simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
-                                                    size_t size) {
-  size_t pos = 0;
-  size_t count = 0;
-  // This algorithm could no doubt be improved!
-  for (; pos + 64 <= size; pos += 64) {
-    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-    // We count one word for anything that is not a continuation (so
-    // leading bytes).
-    count += 64 - count_ones(utf8_continuation_mask);
-    int64_t utf8_4byte = input.gteq_unsigned(240);
-    count += count_ones(utf8_4byte);
-  }
-  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
-}
-} // namespace utf8
-} // unnamed namespace
-} // namespace arm64
-} // namespace simdutf
-/* end file src/generic/utf8.h */
-// transcoding from UTF-8 to Latin 1
-/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
+      /* In this branch we handle three cases:
+         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+        single UTF-8 byte
+         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
+        UTF-8 bytes
+         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+        three UTF-8 bytes
 
-namespace simdutf {
-namespace arm64 {
-namespace {
-namespace utf8_to_latin1 {
-using namespace simd;
+        We expand the input word (16-bit) into two code units (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
 
-simdutf_really_inline simd8<uint8_t>
-check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
-  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
-  // 0b11000010 and nothing else.
-  //
-  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-  // Bit 1 = Too Long (ASCII followed by continuation)
-  // Bit 2 = Overlong 3-byte
-  // Bit 4 = Surrogate
-  // Bit 5 = Overlong 2-byte
-  // Bit 7 = Two Continuations
-  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
-                                               // 11______ 11______
-  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
-  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
-  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
-  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
-                                               // 11110100 101_____
-                                               // 11110101 1001____
-                                               // 11110101 101_____
-                                               // 1111011_ 1001____
-                                               // 1111011_ 101_____
-                                               // 11111___ 1001____
-                                               // 11111___ 101_____
-  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-  // 11110101 1000____
-  // 1111011_ 1000____
-  // 11111___ 1000____
-  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
-  constexpr const uint8_t FORBIDDEN = 0xff;
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
 
-  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      FORBIDDEN,
-      // 1110____ ________ <three byte lead in byte 1>
-      FORBIDDEN,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      FORBIDDEN);
-  constexpr const uint8_t CARRY =
-      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
-  const simd8<uint8_t> byte_1_low =
-      (prev1 & 0x0F)
-          .lookup_16<uint8_t>(
-              // ____0000 ________
-              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-              // ____0001 ________
-              CARRY | OVERLONG_2,
-              // ____001_ ________
-              CARRY, CARRY,
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
 
-              // ____0100 ________
-              FORBIDDEN,
-              // ____0101 ________
-              FORBIDDEN,
-              // ____011_ ________
-              FORBIDDEN, FORBIDDEN,
+        Finally, from these two code units we build a proper UTF-8 sequence,
+        taking into account the case (i.e., the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
 
-              // ____1___ ________
-              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
-              // ____1101 ________
-              FORBIDDEN, FORBIDDEN, FORBIDDEN);
-  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT,
+      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+      const __m256i s0 = _mm256_srli_epi16(in, 4);
+      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+                                             simdutf_vec(0b0100000000000000));
+      const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
 
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
-          OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      // 4. expand code units 16-bit => 32-bit
+      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
 
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
-  return (byte_1_high & byte_1_low & byte_2_high);
-}
+      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+      // Due to the wider registers, the following path is less likely to be
+      // useful.
+      /*if(mask == 0) {
+        // We only have three-byte code units. Use fast path.
+        const __m256i shuffle =
+      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+      _mm256_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output,
+      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output,
+      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+        continue;
+      }*/
+      const uint8_t mask0 = uint8_t(mask);
+      const uint8_t *row0 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+      const __m128i utf8_0 =
+          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
 
-struct validating_transcoder {
-  // If this is nonzero, there has been a UTF-8 error.
-  simd8<uint8_t> error;
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+      const uint8_t *row1 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+      const __m128i utf8_1 =
+          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
 
-  validating_transcoder() : error(uint8_t(0)) {}
-  //
-  // Check whether the current bytes are valid UTF-8.
-  //
-  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
-                                              const simd8<uint8_t> prev_input) {
-    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
-    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
-    // small negative numbers)
-    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-    this->error |= check_special_cases(input, prev1);
-  }
+      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+      const uint8_t *row2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+      const __m128i utf8_2 =
+          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
 
-  simdutf_really_inline size_t convert(const char *in, size_t size,
-                                       char *latin1_output) {
-    size_t pos = 0;
-    char *start{latin1_output};
-    // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
-    // last 16 bytes, and if the data is valid, then it is entirely safe because
-    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
-    // generally assume that you have valid UTF-8 input, so we are going to go
-    // back from the end counting 16 leading bytes, to give us a good margin.
-    size_t leading_byte = 0;
-    size_t margin = size;
-    for (; margin > 0 && leading_byte < 16; margin--) {
-      leading_byte += (int8_t(in[margin - 1]) >
-                       -65); // twos complement of -65 is 1011 1111 ...
-    }
-    // If the input is long enough, then we have that margin-1 is the eight last
-    // leading byte.
-    const size_t safety_margin = size - margin + 1; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      if (input.is_ascii()) {
-        input.store((int8_t *)latin1_output);
-        latin1_output += 64;
-        pos += 64;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio,
-        // it is not good enough.
-        static_assert(
-            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
-                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        auto zero = simd8<uint8_t>{uint8_t(0)};
-        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-        }
-        uint64_t utf8_continuation_mask =
-            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
-                               // this case, we also have ASCII to account for.
-        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-        // We process in blocks of up to 12 bytes except possibly
-        // for fast paths which may process up to 16 bytes. For the
-        // slow path to work, we should have at least 12 input bytes left.
-        size_t max_starting_point = (pos + 64) - 12;
-        // Next loop is going to run at least five times.
-        while (pos < max_starting_point) {
-          // Performance note: our ability to compute 'consumed' and
-          // then shift and recompute is critical. If there is a
-          // latency of, say, 4 cycles on getting 'consumed', then
-          // the inner loop might have a total latency of about 6 cycles.
-          // Yet we process between 6 to 12 inputs bytes, thus we get
-          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-          // for this section of the code. Hence, there is a limit
-          // to how much we can further increase this latency before
-          // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_latin1(
-              in + pos, utf8_end_of_code_point_mask, latin1_output);
-          pos += consumed;
-          utf8_end_of_code_point_mask >>= consumed;
-        }
-        // At this point there may remain between 0 and 12 bytes in the
-        // 64-byte block. These bytes will be processed again. So we have an
-        // 80% efficiency (in the worst case). In practice we expect an
-        // 85% to 90% efficiency.
-      }
-    }
-    if (errors()) {
-      return 0;
-    }
-    if (pos < size) {
-      size_t howmany =
-          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
-      if (howmany == 0) {
-        return 0;
-      }
-      latin1_output += howmany;
-    }
-    return latin1_output - start;
-  }
+      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+      const uint8_t *row3 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+      const __m128i utf8_3 =
+          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
 
-  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
-                                                   char *latin1_output) {
-    size_t pos = 0;
-    char *start{latin1_output};
-    // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
-    // last 16 bytes, and if the data is valid, then it is entirely safe because
-    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
-    // generally assume that you have valid UTF-8 input, so we are going to go
-    // back from the end counting 8 leading bytes, to give us a good margin.
-    size_t leading_byte = 0;
-    size_t margin = size;
-    for (; margin > 0 && leading_byte < 8; margin--) {
-      leading_byte += (int8_t(in[margin - 1]) > -65);
-    }
-    // If the input is long enough, then we have that margin-1 is the eight last
-    // leading byte.
-    const size_t safety_margin = size - margin + 1; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      if (input.is_ascii()) {
-        input.store((int8_t *)latin1_output);
-        latin1_output += 64;
-        pos += 64;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio,
-        // it is not good enough.
-        static_assert(
-            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
-                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        auto zero = simd8<uint8_t>{uint8_t(0)};
-        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-        }
-        if (errors()) {
-          // rewind_and_convert_with_errors will seek a potential error from
-          // in+pos onward, with the ability to go back up to pos bytes, and
-          // read size-pos bytes forward.
-          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
-              pos, in + pos, size - pos, latin1_output);
-          res.count += pos;
-          return res;
-        }
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-        // We process in blocks of up to 12 bytes except possibly
-        // for fast paths which may process up to 16 bytes. For the
-        // slow path to work, we should have at least 12 input bytes left.
-        size_t max_starting_point = (pos + 64) - 12;
-        // Next loop is going to run at least five times.
-        while (pos < max_starting_point) {
-          // Performance note: our ability to compute 'consumed' and
-          // then shift and recompute is critical. If there is a
-          // latency of, say, 4 cycles on getting 'consumed', then
-          // the inner loop might have a total latency of about 6 cycles.
-          // Yet we process between 6 to 12 inputs bytes, thus we get
-          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-          // for this section of the code. Hence, there is a limit
-          // to how much we can further increase this latency before
-          // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_latin1(
-              in + pos, utf8_end_of_code_point_mask, latin1_output);
-          pos += consumed;
-          utf8_end_of_code_point_mask >>= consumed;
-        }
-        // At this point there may remain between 0 and 12 bytes in the
-        // 64-byte block. These bytes will be processed again. So we have an
-        // 80% efficiency (in the worst case). In practice we expect an
-        // 85% to 90% efficiency.
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+      utf8_output += row0[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+      utf8_output += row1[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+      utf8_output += row2[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+      utf8_output += row3[0];
+      buf += 16;
+      // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
       }
-    }
-    if (errors()) {
-      // rewind_and_convert_with_errors will seek a potential error from in+pos
-      // onward, with the ability to go back up to pos bytes, and read size-pos
-      // bytes forward.
-      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
-          pos, in + pos, size - pos, latin1_output);
-      res.count += pos;
-      return res;
-    }
-    if (pos < size) {
-      // rewind_and_convert_with_errors will seek a potential error from in+pos
-      // onward, with the ability to go back up to pos bytes, and read size-pos
-      // bytes forward.
-      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
-          pos, in + pos, size - pos, latin1_output);
-      if (res.error) { // In case of error, we want the error position
-        res.count += pos;
-        return res;
-      } else { // In case of success, we want the number of word written
-        latin1_output += res.count;
+      for (; k < forward; k++) {
+        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if ((word & 0xFF80) == 0) {
+          *utf8_output++ = char(word);
+        } else if ((word & 0xF800) == 0) {
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xF800) != 0xD800) {
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word =
+              big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) {
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k - 1),
+                utf8_output);
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value >> 18) | 0b11110000);
+          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+        }
       }
-    }
-    return result(error_code::SUCCESS, latin1_output - start);
-  }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+}
+/* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */
+/* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */
+/*
+    The vectorized algorithm works on single SSE register i.e., it
+    loads eight 16-bit code units.
 
-  simdutf_really_inline bool errors() const {
-    return this->error.any_bits_set_anywhere();
-  }
+    We consider three cases:
+    1. an input register contains no surrogates and each value
+       is in range 0x0000 .. 0x07ff.
+    2. an input register contains no surrogates and values are
+       in range 0x0000 .. 0xffff.
+    3. an input register contains surrogates --- i.e. codepoints
+       can have 16 or 32 bits.
 
-}; // struct utf8_checker
-} // namespace utf8_to_latin1
-} // unnamed namespace
-} // namespace arm64
-} // namespace simdutf
-/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
-/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
+    Ad 1.
 
-namespace simdutf {
-namespace arm64 {
-namespace {
-namespace utf8_to_latin1 {
-using namespace simd;
+    When values are less than 0x0800, it means that a 16-bit code unit
+    can be converted into: 1) single UTF8 byte (when it is an ASCII
+    char) or 2) two UTF8 bytes.
 
-simdutf_really_inline size_t convert_valid(const char *in, size_t size,
-                                           char *latin1_output) {
-  size_t pos = 0;
-  char *start{latin1_output};
-  // In the worst case, we have the haswell kernel which can cause an overflow
-  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
-  // 16 bytes, and if the data is valid, then it is entirely safe because 16
-  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
-  // assume that you have valid UTF-8 input, so we are going to go back from the
-  // end counting 8 leading bytes, to give us a good margin.
-  size_t leading_byte = 0;
-  size_t margin = size;
-  for (; margin > 0 && leading_byte < 8; margin--) {
-    leading_byte += (int8_t(in[margin - 1]) >
-                     -65); // twos complement of -65 is 1011 1111 ...
-  }
-  // If the input is long enough, then we have that margin-1 is the eight last
-  // leading byte.
-  const size_t safety_margin = size - margin + 1; // to avoid overruns!
-  while (pos + 64 + safety_margin <= size) {
-    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-    if (input.is_ascii()) {
-      input.store((int8_t *)latin1_output);
-      latin1_output += 64;
-      pos += 64;
+    For this case we do only some shuffle to obtain these 2-byte
+    codes and finally compress the whole SSE register with a single
+    shuffle.
+
+    We need 256-entry lookup table to get a compression pattern
+    and the number of output bytes in the compressed vector register.
+    Each entry occupies 17 bytes.
+
+    Ad 2.
+
+    When values fit in 16-bit code units, but are above 0x07ff, then
+    a single word may produce one, two or three UTF8 bytes.
+
+    We prepare data for all these three cases in two registers.
+    The first register contains lower two UTF8 bytes (used in all
+    cases), while the second one contains just the third byte for
+    the three-UTF8-bytes case.
+
+    Finally these two registers are interleaved forming eight-element
+    array of 32-bit values. The array spans two SSE registers.
+    The bytes from the registers are compressed using two shuffles.
+
+    We need 256-entry lookup table to get a compression pattern
+    and the number of output bytes in the compressed vector register.
+    Each entry occupies 17 bytes.
+
+
+    To summarize:
+    - We need two 256-entry tables that have 8704 bytes in total.
+*/
+
+/*
+  Returns a pair: the first unprocessed code unit from buf (nullptr on error)
+  and utf32_output. A scalar routine should carry on converting the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char32_t *>
+avx2_convert_utf16_to_utf32(const char16_t *buf, size_t len,
+                            char32_t *utf32_output) {
+  const char16_t *end = buf + len;
+  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
+
+  while (end - buf >= 16) {
+    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+    if (big_endian) {
+      const __m256i swap = _mm256_setr_epi8(
+          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      in = _mm256_shuffle_epi8(in, swap);
+    }
+
+    // 1. Check if there are any surrogate words in the input chunk.
+    //    We also have to deal with the situation where there is a surrogate
+    //    word at the end of a chunk.
+    const __m256i surrogates_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+
+    // bitmask = 0x00000000 if there are no surrogates
+    //         = 0xc0000000 if only the last word is a surrogate
+    const uint32_t surrogates_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc0000000 could
+    // help. However, it is likely an uncommon occurrence.
+    if (surrogates_bitmask == 0x00000000) {
+      // case: we extend all sixteen 16-bit code units to sixteen 32-bit code
+      // units
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
+                          _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
+      _mm256_storeu_si256(
+          reinterpret_cast<__m256i *>(utf32_output + 8),
+          _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
+      utf32_output += 16;
+      buf += 16;
+      // surrogate pair(s) in a register
     } else {
-      // you might think that a for-loop would work, but under Visual Studio, it
-      // is not good enough.
-      uint64_t utf8_continuation_mask =
-          input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
-                             // this case, we also have ASCII to account for.
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-      // We process in blocks of up to 12 bytes except possibly
-      // for fast paths which may process up to 16 bytes. For the
-      // slow path to work, we should have at least 12 input bytes left.
-      size_t max_starting_point = (pos + 64) - 12;
-      // Next loop is going to run at least five times.
-      while (pos < max_starting_point) {
-        // Performance note: our ability to compute 'consumed' and
-        // then shift and recompute is critical. If there is a
-        // latency of, say, 4 cycles on getting 'consumed', then
-        // the inner loop might have a total latency of about 6 cycles.
-        // Yet we process between 6 to 12 inputs bytes, thus we get
-        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-        // for this section of the code. Hence, there is a limit
-        // to how much we can further increase this latency before
-        // it seriously harms performance.
-        size_t consumed = convert_masked_utf8_to_latin1(
-            in + pos, utf8_end_of_code_point_mask, latin1_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
       }
-      // At this point there may remain between 0 and 12 bytes in the
-      // 64-byte block. These bytes will be processed again. So we have an
-      // 80% efficiency (in the worst case). In practice we expect an
-      // 85% to 90% efficiency.
+      for (; k < forward; k++) {
+        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if ((word & 0xF800) != 0xD800) {
+          // No surrogate pair
+          *utf32_output++ = char32_t(word);
+        } else {
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word =
+              big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) {
+            return std::make_pair(nullptr, utf32_output);
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
+        }
+      }
+      buf += k;
     }
-  }
-  if (pos < size) {
-    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
-                                                           latin1_output);
-    latin1_output += howmany;
-  }
-  return latin1_output - start;
+  } // while
+  return std::make_pair(buf, utf32_output);
 }
 
-} // namespace utf8_to_latin1
-} // namespace
-} // namespace arm64
-} // namespace simdutf
-  // namespace simdutf
-/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
+/*
+  Returns a pair: a result struct and utf32_output.
+  If there is an error, the count field of the result is the position of the
+  error. Otherwise, it is the position of the first unprocessed code unit in
+  buf (even if finished). A scalar routine should carry on the conversion of
+  the tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char32_t *>
+avx2_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
+                                        char32_t *utf32_output) {
+  const char16_t *start = buf;
+  const char16_t *end = buf + len;
+  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
+  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
 
-// placeholder scalars
+  while (end - buf >= 16) {
+    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+    if (big_endian) {
+      const __m256i swap = _mm256_setr_epi8(
+          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
+          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
+      in = _mm256_shuffle_epi8(in, swap);
+    }
 
-//
-// Implementation-specific overrides
-//
-namespace simdutf {
-namespace arm64 {
+    // 1. Check if there are any surrogate words in the input chunk.
+    //    We also have to deal with the situation where there is a surrogate
+    //    word at the end of a chunk.
+    const __m256i surrogates_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
 
-simdutf_warn_unused int
-implementation::detect_encodings(const char *input,
-                                 size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if (bom_encoding != encoding_type::unspecified) {
-    return bom_encoding;
-  }
-  // todo: reimplement as a one-pass algorithm.
-  int out = 0;
-  if (validate_utf8(input, length)) {
-    out |= encoding_type::UTF8;
-  }
-  if ((length % 2) == 0) {
-    if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
-                         length / 2)) {
-      out |= encoding_type::UTF16_LE;
-    }
-  }
-  if ((length % 4) == 0) {
-    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
-      out |= encoding_type::UTF32_LE;
+    // bitmask = 0x00000000 if there are no surrogates
+    //         = 0xc0000000 if only the last word is a surrogate
+    const uint32_t surrogates_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    // It might seem like checking for surrogates_bitmask == 0xc0000000 could
+    // help. However, it is likely an uncommon occurrence.
+    if (surrogates_bitmask == 0x00000000) {
+      // case: we extend all sixteen 16-bit code units to sixteen 32-bit code
+      // units
+      _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
+                          _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
+      _mm256_storeu_si256(
+          reinterpret_cast<__m256i *>(utf32_output + 8),
+          _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
+      utf32_output += 16;
+      buf += 16;
+      // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        if ((word & 0xF800) != 0xD800) {
+          // No surrogate pair
+          *utf32_output++ = char32_t(word);
+        } else {
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word =
+              big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) {
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k - 1),
+                utf32_output);
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
+        }
+      }
+      buf += k;
     }
-  }
-  return out;
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
 }
+/* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */
 
-simdutf_warn_unused bool
-implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  return arm64::utf8_validation::generic_validate_utf8(buf, len);
-}
+/* begin file src/haswell/avx2_convert_utf32_to_latin1.cpp */
+std::pair<const char32_t *, char *>
+avx2_convert_utf32_to_latin1(const char32_t *buf, size_t len,
+                             char *latin1_output) {
+  const size_t rounded_len =
+      len & ~0x1F; // Round down to nearest multiple of 32
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(
-    const char *buf, size_t len) const noexcept {
-  return arm64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
-}
+  __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);
 
-simdutf_warn_unused bool
-implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return arm64::utf8_validation::generic_validate_ascii(buf, len);
-}
+  __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                                     -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1,
+                                     -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(
-    const char *buf, size_t len) const noexcept {
-  return arm64::utf8_validation::generic_validate_ascii_with_errors(buf, len);
-}
+  for (size_t i = 0; i < rounded_len; i += 16) {
+    __m256i in1 = _mm256_loadu_si256((__m256i *)buf);
+    __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8));
 
-simdutf_warn_unused bool
-implementation::validate_utf16le(const char16_t *buf,
-                                 size_t len) const noexcept {
-  if (simdutf_unlikely(len == 0)) {
-    // empty input is valid. protected the implementation from nullptr.
-    return true;
-  }
-  const char16_t *tail = arm_validate_utf16<endianness::LITTLE>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::LITTLE>(tail,
-                                                       len - (tail - buf));
-  } else {
-    return false;
-  }
-}
+    __m256i check_combined = _mm256_or_si256(in1, in2);
 
-simdutf_warn_unused bool
-implementation::validate_utf16be(const char16_t *buf,
-                                 size_t len) const noexcept {
-  if (simdutf_unlikely(len == 0)) {
-    // empty input is valid. protected the implementation from nullptr.
-    return true;
-  }
-  const char16_t *tail = arm_validate_utf16<endianness::BIG>(buf, len);
-  if (tail) {
-    return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
-}
+    if (!_mm256_testz_si256(check_combined, high_bytes_mask)) {
+      return std::make_pair(nullptr, latin1_output);
+    }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(
-    const char16_t *buf, size_t len) const noexcept {
-  if (simdutf_unlikely(len == 0)) {
-    return result(error_code::SUCCESS, 0);
-  }
-  result res = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(
-        buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
-}
+    // Turn UTF32 bytes into latin 1 bytes
+    __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask);
+    __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask);
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(
-    const char16_t *buf, size_t len) const noexcept {
-  if (simdutf_unlikely(len == 0)) {
-    return result(error_code::SUCCESS, 0);
-  }
-  result res = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
-  if (res.count != len) {
-    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(
-        buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
-  }
-}
+    // move Latin1 bytes to their correct spot
+    __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0);
+    __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1);
+    __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1);
+    __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2);
 
-simdutf_warn_unused bool
-implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
-  if (simdutf_unlikely(len == 0)) {
-    // empty input is valid. protected the implementation from nullptr.
-    return true;
-  }
-  const char32_t *tail = arm_validate_utf32le(buf, len);
-  if (tail) {
-    return scalar::utf32::validate(tail, len - (tail - buf));
-  } else {
-    return false;
-  }
-}
+    __m256i result = _mm256_or_si256(reshuffled1, reshuffled2);
+    _mm_storeu_si128((__m128i *)latin1_output, _mm256_castsi256_si128(result));
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(
-    const char32_t *buf, size_t len) const noexcept {
-  if (simdutf_unlikely(len == 0)) {
-    return result(error_code::SUCCESS, 0);
-  }
-  result res = arm_validate_utf32le_with_errors(buf, len);
-  if (res.count != len) {
-    result scalar_res =
-        scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
-    return result(scalar_res.error, res.count + scalar_res.count);
-  } else {
-    return res;
+    latin1_output += 16;
+    buf += 16;
   }
+
+  return std::make_pair(buf, latin1_output);
 }
+std::pair<result, char *>
+avx2_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+                                         char *latin1_output) {
+  const size_t rounded_len =
+      len & ~0x1F; // Round down to a multiple of 32; the loop steps by 16
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
-    const char *buf, size_t len, char *utf8_output) const noexcept {
-  std::pair<const char *, char *> ret =
-      arm_convert_latin1_to_utf8(buf, len, utf8_output);
-  size_t converted_chars = ret.second - utf8_output;
+  __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);
+  __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                                     -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1,
+                                     -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
 
-  if (ret.first != buf + len) {
-    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
-        ret.first, len - (ret.first - buf), ret.second);
-    converted_chars += scalar_converted_chars;
-  }
-  return converted_chars;
-}
+  const char32_t *start = buf;
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  std::pair<const char *, char16_t *> ret =
-      arm_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  size_t converted_chars = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_converted_chars =
-        scalar::latin1_to_utf16::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-    converted_chars += scalar_converted_chars;
-  }
-  return converted_chars;
-}
+  for (size_t i = 0; i < rounded_len; i += 16) {
+    __m256i in1 = _mm256_loadu_si256((__m256i *)buf);
+    __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8));
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  std::pair<const char *, char16_t *> ret =
-      arm_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  size_t converted_chars = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_converted_chars =
-        scalar::latin1_to_utf16::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-    converted_chars += scalar_converted_chars;
-  }
-  return converted_chars;
-}
+    __m256i check_combined = _mm256_or_si256(in1, in2);
+
+    if (!_mm256_testz_si256(check_combined, high_bytes_mask)) {
+      // Error path: scalar scan of the first 8 code points (buf += 8 below)
+      for (int k = 0; k < 8; k++) {
+        char32_t codepoint = buf[k];
+        if (codepoint <= 0xFF) {
+          *latin1_output++ = static_cast<char>(codepoint);
+        } else {
+          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+                                latin1_output);
+        }
+      }
+      buf += 8;
+    } else {
+      __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask);
+      __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask);
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
-    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
-  std::pair<const char *, char32_t *> ret =
-      arm_convert_latin1_to_utf32(buf, len, utf32_output);
-  size_t converted_chars = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
-        ret.first, len - (ret.first - buf), ret.second);
-    converted_chars += scalar_converted_chars;
+      __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0);
+      __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1);
+      __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1);
+      __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2);
+
+      __m256i result = _mm256_or_si256(reshuffled1, reshuffled2);
+      _mm_storeu_si128((__m128i *)latin1_output,
+                       _mm256_castsi256_si128(result));
+
+      latin1_output += 16;
+      buf += 16;
+    }
   }
-  return converted_chars;
-}
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
-    const char *buf, size_t len, char *latin1_output) const noexcept {
-  utf8_to_latin1::validating_transcoder converter;
-  return converter.convert(buf, len, latin1_output);
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        latin1_output);
 }
+/* end file src/haswell/avx2_convert_utf32_to_latin1.cpp */
+/* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */
+std::pair<const char32_t *, char *>
+avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
+  const char32_t *end = buf + len;
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+  __m256i running_max = _mm256_setzero_si256();
+  __m256i forbidden_bytemask = _mm256_setzero_si256();
 
-simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
-    const char *buf, size_t len, char *latin1_output) const noexcept {
-  utf8_to_latin1::validating_transcoder converter;
-  return converter.convert_with_errors(buf, len, latin1_output);
-}
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
-    const char *buf, size_t len, char *latin1_output) const noexcept {
-  return arm64::utf8_to_latin1::convert_valid(buf, len, latin1_output);
-}
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
+    running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
-}
+    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
+    // saturation
+    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
+                                        _mm256_and_si256(nextin, v_7fffffff));
+    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert<endianness::BIG>(buf, len, utf16_output);
-}
+    // Try to apply UTF-16 => UTF-8 routine on 256 bits
+    // (haswell/avx2_convert_utf16_to_utf8.cpp)
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::LITTLE>(buf, len,
-                                                           utf16_output);
-}
+    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+      // 1. pack the bytes
+      const __m128i utf8_packed = _mm_packus_epi16(
+          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+      // 2. store (16 bytes)
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+      // 3. adjust pointers
+      buf += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
+    }
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  utf8_to_utf16::validating_transcoder converter;
-  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
-}
+    // no bits set above 11th bit
+    const __m256i one_or_two_bytes_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+    const uint32_t one_or_two_bytes_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+    if (one_or_two_bytes_bitmask == 0xffffffff) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
-    const char *input, size_t size, char16_t *utf16_output) const noexcept {
-  return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,
-                                                          utf16_output);
-}
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const __m256i t3 = _mm256_or_si256(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const __m256i t4 = _mm256_or_si256(t3, v_c080);
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
-    const char *input, size_t size, char16_t *utf16_output) const noexcept {
-  return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,
-                                                       utf16_output);
-}
+      // 2. merge ASCII and 2-byte codewords
+      const __m256i utf8_unpacked =
+          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
-    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert(buf, len, utf32_output);
-}
+      // 3. prepare bitmask for 8-bit lookup
+      const uint32_t M0 = one_byte_bitmask & 0x55555555;
+      const uint32_t M1 = M0 >> 7;
+      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+      // 4. pack the bytes
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
-    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
-  utf8_to_utf32::validating_transcoder converter;
-  return converter.convert_with_errors(buf, len, utf32_output);
-}
+      const uint8_t *row =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+      const uint8_t *row_2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+                                                                       16)][0];
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
-    const char *input, size_t size, char32_t *utf32_output) const noexcept {
-  return utf8_to_utf32::convert_valid(input, size, utf32_output);
-}
+      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  std::pair<const char16_t *, char *> ret =
-      arm_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - latin1_output;
+      const __m256i utf8_packed = _mm256_shuffle_epi8(
+          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+      // 5. store bytes
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_castsi256_si128(utf8_packed));
+      utf8_output += row[0];
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_extractf128_si256(utf8_packed, 1));
+      utf8_output += row_2[0];
 
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf16_to_latin1::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
+      // 6. adjust pointers
+      buf += 16;
+      continue;
     }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
+    // Must check for overflow in packing (any input word above 0xFFFF)
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
+        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+    if (saturation_bitmask == 0xffffffff) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+      forbidden_bytemask = _mm256_or_si256(
+          forbidden_bytemask,
+          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  std::pair<const char16_t *, char *> ret =
-      arm_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - latin1_output;
+      const __m256i dup_even = _mm256_setr_epi16(
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
 
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf16_to_latin1::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
+      /* In this branch we handle three cases:
+        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+        single UTF-8 byte
+        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
+        UTF-8 bytes
+        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+        three UTF-8 bytes
 
-simdutf_warn_unused result
-implementation::convert_utf16le_to_latin1_with_errors(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  std::pair<result, char *> ret =
-      arm_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
-          buf, len, latin1_output);
-  if (ret.first.error) {
-    return ret.first;
-  } // Can return directly since scalar fallback already found correct
-    // ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res =
-        scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count =
-      ret.second -
-      latin1_output; // Set count to the number of 8-bit code units written
-  return ret.first;
-}
+        We expand the input word (16-bit) into two code units (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
 
-simdutf_warn_unused result
-implementation::convert_utf16be_to_latin1_with_errors(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  std::pair<result, char *> ret =
-      arm_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
-                                                               latin1_output);
-  if (ret.first.error) {
-    return ret.first;
-  } // Can return directly since scalar fallback already found correct
-    // ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res =
-        scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count =
-      ret.second -
-      latin1_output; // Set count to the number of 8-bit code units written
-  return ret.first;
-}
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  // optimization opportunity: implement a custom function.
-  return convert_utf16be_to_latin1(buf, len, latin1_output);
-}
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  // optimization opportunity: implement a custom function.
-  return convert_utf16le_to_latin1(buf, len, latin1_output);
-}
+        Finally from these two code units we build proper UTF-8 sequence, taking
+        into account the case (i.e., the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  std::pair<const char16_t *, char *> ret =
-      arm_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf16_to_utf8::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
+      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+                                             simdutf_vec(0b0100000000000000));
+      const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  std::pair<const char16_t *, char *> ret =
-      arm_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf16_to_utf8::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
+      // 4. expand code units 16-bit => 32-bit
+      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of
-  // code units written even if finished
-  std::pair<result, char *> ret =
-      arm_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len,
-                                                                utf8_output);
-  if (ret.first.error) {
-    return ret.first;
-  } // Can return directly since scalar fallback already found correct
-    // ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res =
-        scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count =
-      ret.second -
-      utf8_output; // Set count to the number of 8-bit code units written
-  return ret.first;
-}
+      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+      // Due to the wider registers, the following path is less likely to be
+      // useful.
+      /*if(mask == 0) {
+        // We only have three-byte code units. Use fast path.
+        const __m256i shuffle =
+      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+      _mm256_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output,
+      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output,
+      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+        continue;
+      }*/
+      const uint8_t mask0 = uint8_t(mask);
+      const uint8_t *row0 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+      const __m128i utf8_0 =
+          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of
-  // code units written even if finished
-  std::pair<result, char *> ret =
-      arm_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len,
-                                                             utf8_output);
-  if (ret.first.error) {
-    return ret.first;
-  } // Can return directly since scalar fallback already found correct
-    // ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res =
-        scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+      const uint8_t *row1 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+      const __m128i utf8_1 =
+          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+
+      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+      const uint8_t *row2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+      const __m128i utf8_2 =
+          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+
+      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+      const uint8_t *row3 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+      const __m128i utf8_3 =
+          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+      utf8_output += row0[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+      utf8_output += row1[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+      utf8_output += row2[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+      utf8_output += row3[0];
+      buf += 16;
     } else {
-      ret.second += scalar_res.count;
+      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
+      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+      // wasteful to use scalar code, but being efficient with SIMD may require
+      // large, non-trivial tables?
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+          *utf8_output++ = char(word);
+        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(nullptr, utf8_output);
+          }
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else { // 4-byte
+          if (word > 0x10FFFF) {
+            return std::make_pair(nullptr, utf8_output);
+          }
+          *utf8_output++ = char((word >> 18) | 0b11110000);
+          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
     }
+  } // while
+
+  // check for invalid input: a code point above 0x10FFFF was loaded above
+  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+  if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(
+          _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
+    return std::make_pair(nullptr, utf8_output);
   }
-  ret.first.count =
-      ret.second -
-      utf8_output; // Set count to the number of 8-bit code units written
-  return ret.first;
-}
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return convert_utf16le_to_utf8(buf, len, utf8_output);
-}
+  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+    return std::make_pair(nullptr, utf8_output);
+  }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return convert_utf16be_to_utf8(buf, len, utf8_output);
+  return std::make_pair(buf, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  if (simdutf_unlikely(len == 0)) {
-    return 0;
-  }
-  std::pair<const char32_t *, char *> ret =
-      arm_convert_utf32_to_utf8(buf, len, utf8_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
+std::pair<result, char *>
+avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
+                                       char *utf8_output) {
+  const char32_t *end = buf + len;
+  const char32_t *start = buf;
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  if (simdutf_unlikely(len == 0)) {
-    return result(error_code::SUCCESS, 0);
-  }
-  // ret.first.count is always the position in the buffer, not the number of
-  // code units written even if finished
-  std::pair<result, char *> ret =
-      arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count =
-      ret.second -
-      utf8_output; // Set count to the number of 8-bit code units written
-  return ret.first;
-}
+  const __m256i v_0000 = _mm256_setzero_si256();
+  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
+  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
+  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
+  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
+  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  std::pair<const char16_t *, char32_t *> ret =
-      arm_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  std::pair<const char16_t *, char32_t *> ret =
-      arm_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf16_to_utf32::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
+    // Check for too large input
+    const __m256i max_input =
+        _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
+    if (static_cast<uint32_t>(_mm256_movemask_epi8(
+            _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
+      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+                            utf8_output);
     }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of
-  // code units written even if finished
-  std::pair<result, char32_t *> ret =
-      arm_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len,
-                                                                 utf32_output);
-  if (ret.first.error) {
-    return ret.first;
-  } // Can return directly since scalar fallback already found correct
-    // ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res =
-        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count =
-      ret.second -
-      utf32_output; // Set count to the number of 8-bit code units written
-  return ret.first;
-}
+    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
+    // saturation
+    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
+                                        _mm256_and_si256(nextin, v_7fffffff));
+    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of
-  // code units written even if finished
-  std::pair<result, char32_t *> ret =
-      arm_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len,
-                                                              utf32_output);
-  if (ret.first.error) {
-    return ret.first;
-  } // Can return directly since scalar fallback already found correct
-    // ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res =
-        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+    // Try to apply UTF-16 => UTF-8 routine on 256 bits
+    // (haswell/avx2_convert_utf16_to_utf8.cpp)
+
+    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+      // 1. pack the bytes
+      const __m128i utf8_packed = _mm_packus_epi16(
+          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+      // 2. store (16 bytes)
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+      // 3. adjust pointers
+      buf += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
     }
-  }
-  ret.first.count =
-      ret.second -
-      utf32_output; // Set count to the number of 8-bit code units written
-  return ret.first;
-}
+    // no bits set above 7th bit
+    const __m256i one_byte_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
+    const uint32_t one_byte_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
-    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
-  std::pair<const char32_t *, char *> ret =
-      arm_convert_utf32_to_latin1(buf, len, latin1_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - latin1_output;
+    // no bits set above 11th bit
+    const __m256i one_or_two_bytes_bytemask =
+        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
+    const uint32_t one_or_two_bytes_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
+    if (one_or_two_bytes_bitmask == 0xffffffff) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
+      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
 
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
-        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
+      // t0 = [000a|aaaa|bbbb|bb00]
+      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+      // t2 = [0000|0000|00bb|bbbb]
+      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+      // t3 = [000a|aaaa|00bb|bbbb]
+      const __m256i t3 = _mm256_or_si256(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      const __m256i t4 = _mm256_or_si256(t3, v_c080);
 
-simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
-    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
-  std::pair<result, char *> ret =
-      arm_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
-  if (ret.first.error) {
-    return ret.first;
-  } // Can return directly since scalar fallback already found correct
-    // ret.first.count
-  if (ret.first.count != len) { // All good so far, but not finished
-    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
-        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count =
-      ret.second -
-      latin1_output; // Set count to the number of 8-bit code units written
-  return ret.first;
-}
+      // 2. merge ASCII and 2-byte codewords
+      const __m256i utf8_unpacked =
+          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
-    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
-  std::pair<const char32_t *, char *> ret =
-      arm_convert_utf32_to_latin1(buf, len, latin1_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - latin1_output;
+      // 3. prepare bitmask for 8-bit lookup
+      const uint32_t M0 = one_byte_bitmask & 0x55555555;
+      const uint32_t M1 = M0 >> 7;
+      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+      // 4. pack the bytes
 
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid(
-        ret.first, len - (ret.first - buf), ret.second);
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
+      const uint8_t *row =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
+      const uint8_t *row_2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
+                                                                       16)][0];
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  // optimization opportunity: implement a custom function.
-  return convert_utf32_to_utf8(buf, len, utf8_output);
-}
+      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  std::pair<const char32_t *, char16_t *> ret =
-      arm_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
+      const __m256i utf8_packed = _mm256_shuffle_epi8(
+          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
+      // 5. store bytes
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_castsi256_si128(utf8_packed));
+      utf8_output += row[0];
+      _mm_storeu_si128((__m128i *)utf8_output,
+                       _mm256_extractf128_si256(utf8_packed, 1));
+      utf8_output += row_2[0];
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  std::pair<const char32_t *, char16_t *> ret =
-      arm_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf32_to_utf16::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
+      // 6. adjust pointers
+      buf += 16;
+      continue;
     }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
+    // Must check for overflow in packing
+    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
+        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+    const uint32_t saturation_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+    if (saturation_bitmask == 0xffffffff) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of
-  // code units written even if finished
-  std::pair<result, char16_t *> ret =
-      arm_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len,
-                                                                 utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res =
-        scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count =
-      ret.second -
-      utf16_output; // Set count to the number of 8-bit code units written
-  return ret.first;
-}
+      // Check for illegal surrogate code units
+      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
+      const __m256i forbidden_bytemask =
+          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
+      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
+          0x0) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start),
+                              utf8_output);
+      }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of
-  // code units written even if finished
-  std::pair<result, char16_t *> ret =
-      arm_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
-                                                              utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res =
-        scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count =
-      ret.second -
-      utf16_output; // Set count to the number of 8-bit code units written
-  return ret.first;
-}
+      const __m256i dup_even = _mm256_setr_epi16(
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
+          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return convert_utf32_to_utf16le(buf, len, utf16_output);
-}
+      /* In this branch we handle three cases:
+        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+        single UTF-8 byte
+        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
+        UTF-8 bytes
+        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+        three UTF-8 bytes
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return convert_utf32_to_utf16be(buf, len, utf16_output);
-}
+        We expand the input word (16-bit) into two code units (32-bit), thus
+        we have room for four bytes. However, we need five distinct bit
+        layouts. Note that the last byte in cases #2 and #3 is the same.
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return convert_utf16le_to_utf32(buf, len, utf32_output);
-}
+        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+        in register t2.
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return convert_utf16be_to_utf32(buf, len, utf32_output);
-}
+        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+        either byte 1 for case #2 or byte 2 for case #3. Note that they
+        differ by exactly one bit.
 
-void implementation::change_endianness_utf16(const char16_t *input,
-                                             size_t length,
-                                             char16_t *output) const noexcept {
-  utf16::change_endianness_utf16(input, length, output);
-}
+        Finally from these two code units we build proper UTF-8 sequence, taking
+        into account the case (i.e, the number of bytes to write).
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
 
-simdutf_warn_unused size_t implementation::count_utf16le(
-    const char16_t *input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::LITTLE>(input, length);
-}
+      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
+      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
+      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
+      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
+      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
+                                             simdutf_vec(0b0100000000000000));
+      const __m256i s4 = _mm256_xor_si256(s3, m0);
+#undef simdutf_vec
+
+      // 4. expand code units 16-bit => 32-bit
+      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
+      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+
+      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
+                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
+      // Due to the wider registers, the following path is less likely to be
+      // useful.
+      /*if(mask == 0) {
+        // We only have three-byte code units. Use fast path.
+        const __m256i shuffle =
+      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
+      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
+      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
+      _mm256_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output,
+      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
+        _mm_storeu_si128((__m128i*)utf8_output,
+      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+        continue;
+      }*/
+      const uint8_t mask0 = uint8_t(mask);
+      const uint8_t *row0 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
+      const __m128i utf8_0 =
+          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
 
-simdutf_warn_unused size_t implementation::count_utf16be(
-    const char16_t *input, size_t length) const noexcept {
-  return utf16::count_code_points<endianness::BIG>(input, length);
-}
+      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+      const uint8_t *row1 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
+      const __m128i utf8_1 =
+          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
 
-simdutf_warn_unused size_t
-implementation::count_utf8(const char *input, size_t length) const noexcept {
-  return utf8::count_code_points(input, length);
-}
+      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
+      const uint8_t *row2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
+      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
+      const __m128i utf8_2 =
+          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
-    const char *buf, size_t len) const noexcept {
-  return count_utf8(buf, len);
-}
+      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
+      const uint8_t *row3 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
+      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
+      const __m128i utf8_3 =
+          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
 
-simdutf_warn_unused size_t
-implementation::latin1_length_from_utf16(size_t length) const noexcept {
-  return scalar::utf16::latin1_length_from_utf16(length);
-}
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
+      utf8_output += row0[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+      utf8_output += row1[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
+      utf8_output += row2[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
+      utf8_output += row3[0];
+      buf += 16;
+    } else {
+      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
+      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
+      // wasteful to use scalar code, but being efficient with SIMD may require
+      // large, non-trivial tables?
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+          *utf8_output++ = char(word);
+        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k), utf8_output);
+          }
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else { // 4-byte
+          if (word > 0x10FFFF) {
+            return std::make_pair(
+                result(error_code::TOO_LARGE, buf - start + k), utf8_output);
+          }
+          *utf8_output++ = char((word >> 18) | 0b11110000);
+          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
 
-simdutf_warn_unused size_t
-implementation::latin1_length_from_utf32(size_t length) const noexcept {
-  return scalar::utf32::latin1_length_from_utf32(length);
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
+/* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */
+/* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */
+template <endianness big_endian> // AVX2 UTF-32 -> UTF-16; big_endian byte-swaps each output unit
+std::pair<const char32_t *, char16_t *> // {input position reached, or nullptr on invalid input; output end}
+avx2_convert_utf32_to_utf16(const char32_t *buf, size_t len,
+                            char16_t *utf16_output) {
+  const char32_t *end = buf + len; // one past the last input code point
 
-simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
-    const char *input, size_t length) const noexcept {
-  // See
-  // https://lemire.me/blog/2023/05/15/computing-the-utf-8-size-of-a-latin-1-string-quickly-arm-neon-edition/
-  // credit to Pete Cawley
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
-  uint64_t result = 0;
-  const int lanes = sizeof(uint8x16_t);
-  uint8_t rem = length % lanes;
-  const uint8_t *simd_end = data + (length / lanes) * lanes;
-  const uint8x16_t threshold = vdupq_n_u8(0x80);
-  for (; data < simd_end; data += lanes) {
-    // load 16 bytes
-    uint8x16_t input_vec = vld1q_u8(data);
-    // compare to threshold (0x80)
-    uint8x16_t withhighbit = vcgeq_u8(input_vec, threshold);
-    // vertical addition
-    result -= vaddvq_s8(vreinterpretq_s8_u8(withhighbit));
-  }
-  return result + (length / lanes) * lanes +
-         scalar::latin1::utf8_length_from_latin1((const char *)simd_end, rem);
-}
+  const size_t safety_margin =
+      12; // slack left unconverted at the tail to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+  __m256i forbidden_bytemask = _mm256_setzero_si256(); // sticky surrogate hits, tested after the loop
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
-    const char16_t *input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
-}
+  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { // 8 code points per vector
+    __m256i in = _mm256_loadu_si256((__m256i *)buf); // unaligned load of 8 x 32 bits
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
-    const char16_t *input, size_t length) const noexcept {
-  return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
-}
+    const __m256i v_00000000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
 
-simdutf_warn_unused size_t
-implementation::utf16_length_from_latin1(size_t length) const noexcept {
-  return scalar::latin1::utf16_length_from_latin1(length);
-}
+    // no bits set above the 16th bit <=> each value packs to one UTF-16 unit
+    const __m256i saturation_bytemask =
+        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+    const uint32_t saturation_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
 
-simdutf_warn_unused size_t
-implementation::utf32_length_from_latin1(size_t length) const noexcept {
-  return scalar::latin1::utf32_length_from_latin1(length);
-}
+    if (saturation_bitmask == 0xffffffff) { // all 8 values fit in 16 bits
+      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+      forbidden_bytemask = _mm256_or_si256(
+          forbidden_bytemask,
+          _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); // flags 0xD800..0xDFFF
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
-    const char16_t *input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
-}
+      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
+                                              _mm256_extractf128_si256(in, 1)); // low half, then high half
+      if (big_endian) {
+        const __m128i swap =
+            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); // swap the two bytes of every unit
+      }
+      _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); // writes 8 UTF-16 units
+      utf16_output += 8;
+      buf += 8;
+    } else { // at least one value is above 0xFFFF: handle this stretch with scalar code
+      size_t forward = 7; // scalar code points converted before retrying the vector path
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) { // NOTE(review): looks unreachable, the loop guarantees >= 20 remaining
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFF0000) == 0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(nullptr, utf16_output); // surrogate code point: invalid input
+          }
+          *utf16_output++ =
+              big_endian
+                  ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
+                  : char16_t(word);
+        } else {
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) {
+            return std::make_pair(nullptr, utf16_output); // beyond 0x10FFFF: invalid input
+          }
+          word -= 0x10000; // 20 bits remain: high 10 -> first unit, low 10 -> second
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (big_endian) {
+            high_surrogate =
+                uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+            low_surrogate =
+                uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
+        }
+      }
+      buf += k; // resume the vector loop after the scalar stretch
+    }
+  }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
-    const char16_t *input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
-}
+  // deferred check: fast-path surrogates were stored but are only flagged here
+  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+    return std::make_pair(nullptr, utf16_output);
+  }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
-    const char *input, size_t length) const noexcept {
-  return utf8::utf16_length_from_utf8(input, length);
+  return std::make_pair(buf, utf16_output); // buf: first code point not converted here
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
-    const char32_t *input, size_t length) const noexcept {
-  const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f);
-  const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff);
-  const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
-  const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
-  size_t pos = 0;
-  size_t count = 0;
-  for (; pos + 4 <= length; pos += 4) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
-    const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f);
-    const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff);
-    const uint32x4_t two_bytes_bytemask =
-        veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask);
-    const uint32x4_t three_bytes_bytemask =
-        veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask);
+template <endianness big_endian>
+std::pair<result, char16_t *>
+avx2_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
+                                        char16_t *utf16_output) {
+  const char32_t *start = buf;
+  const char32_t *end = buf + len;
 
-    const uint16x8_t reduced_ascii_bytes_bytemask =
-        vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1));
-    const uint16x8_t reduced_two_bytes_bytemask =
-        vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1));
-    const uint16x8_t reduced_three_bytes_bytemask =
-        vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1));
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
 
-    const uint16x8_t compressed_bytemask0 =
-        vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask);
-    const uint16x8_t compressed_bytemask1 =
-        vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask);
+  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
+    __m256i in = _mm256_loadu_si256((__m256i *)buf);
 
-    size_t ascii_count = count_ones(
-        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0));
-    size_t two_bytes_count = count_ones(
-        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1));
-    size_t three_bytes_count = count_ones(
-        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0));
+    const __m256i v_00000000 = _mm256_setzero_si256();
+    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
 
-    count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
-  }
-  return count +
-         scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
-}
+    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
+    const __m256i saturation_bytemask =
+        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+    const uint32_t saturation_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
-    const char32_t *input, size_t length) const noexcept {
-  const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff);
-  const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1);
-  size_t pos = 0;
-  size_t count = 0;
-  for (; pos + 4 <= length; pos += 4) {
-    uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(input + pos));
-    const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff);
-    const uint16x8_t reduced_bytemask =
-        vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1));
-    const uint16x8_t compressed_bytemask =
-        vpaddq_u16(reduced_bytemask, reduced_bytemask);
-    size_t surrogate_count = count_ones(
-        vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0));
-    count += 4 + surrogate_count;
+    if (saturation_bitmask == 0xffffffff) {
+      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
+      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
+      const __m256i forbidden_bytemask =
+          _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
+      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
+          0x0) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start),
+                              utf16_output);
+      }
+
+      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
+                                              _mm256_extractf128_si256(in, 1));
+      if (big_endian) {
+        const __m128i swap =
+            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+      }
+      _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
+      utf16_output += 8;
+      buf += 8;
+    } else {
+      size_t forward = 7;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFF0000) == 0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k), utf16_output);
+          }
+          *utf16_output++ =
+              big_endian
+                  ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
+                  : char16_t(word);
+        } else {
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) {
+            return std::make_pair(
+                result(error_code::TOO_LARGE, buf - start + k), utf16_output);
+          }
+          word -= 0x10000;
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (big_endian) {
+            high_surrogate =
+                uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
+            low_surrogate =
+                uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
+        }
+      }
+      buf += k;
+    }
   }
-  return count +
-         scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
-}
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
-    const char *input, size_t length) const noexcept {
-  return utf8::count_code_points(input, length);
+  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
 }
+/* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */
 
-simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
-    const char *input, size_t length) const noexcept {
-  return scalar::base64::maximal_binary_length_from_base64(input, length);
-}
+/* begin file src/haswell/avx2_convert_utf8_to_latin1.cpp */
+// depends on "tables/utf8_to_utf16_tables.h"
 
-simdutf_warn_unused result implementation::base64_to_binary(
-    const char *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_options) const noexcept {
-  return (options & base64_url)
-             ? compress_decode_base64<true>(output, input, length, options,
-                                            last_chunk_options)
-             : compress_decode_base64<false>(output, input, length, options,
-                                             last_chunk_options);
-}
+// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
+// end of the code points. Only the least significant 12 bits of the mask
+// are accessed.
+// It returns how many bytes were consumed (up to 12).
+size_t convert_masked_utf8_to_latin1(const char *input,
+                                     uint64_t utf8_end_of_code_point_mask,
+                                     char *&latin1_output) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it
+  // is maybe beneficial to have fast paths that depend on branch prediction but
+  // have less latency. This results in more instructions but, potentially, also
+  // higher speeds.
+  //
+  const __m128i in = _mm_loadu_si128((__m128i *)input);
 
-simdutf_warn_unused full_result implementation::base64_to_binary_details(
-    const char *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_options) const noexcept {
-  return (options & base64_url)
-             ? compress_decode_base64<true>(output, input, length, options,
-                                            last_chunk_options)
-             : compress_decode_base64<false>(output, input, length, options,
-                                             last_chunk_options);
-}
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask &
+      0xfff; // we are only processing 12 bytes in case it is not all ASCII
 
-simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::base64::maximal_binary_length_from_base64(input, length);
+  if (utf8_end_of_code_point_mask == 0xfff) {
+    // We process the data in chunks of 12 bytes.
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
+    latin1_output += 12; // We wrote 12 characters.
+    return 12;           // We consumed 12 bytes.
+  }
+  /// We do not have a fast path available, so we fall back.
+  const uint8_t idx =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+  // this indicates an invalid input:
+  if (idx >= 64) {
+    return consumed;
+  }
+  // Here we should have (idx < 64); if not, there is a bug in the validation
+  // or elsewhere. This is a relatively easy scenario: we process SIX (6)
+  // input code units. The maximum length in bytes of six code units, each
+  // spanning between 1 and 2 bytes, is 12 bytes.
+  // On processors where pdep/pext is fast, we might be able to use a small
+  // lookup table.
+  const __m128i sh =
+      _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+  const __m128i perm = _mm_shuffle_epi8(in, sh);
+  const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+  const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+  __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+  const __m128i latin1_packed = _mm_packus_epi16(composed, composed);
+  // writing 8 bytes even though we only care about the first 6 bytes.
+  // performance note: it would be faster to use _mm_storeu_si128, we should
+  // investigate.
+  _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
+  latin1_output += 6; // We wrote 6 bytes.
+  return consumed;
 }
+/* end file src/haswell/avx2_convert_utf8_to_latin1.cpp */
 
-simdutf_warn_unused result implementation::base64_to_binary(
-    const char16_t *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_options) const noexcept {
-  return (options & base64_url)
-             ? compress_decode_base64<true>(output, input, length, options,
-                                            last_chunk_options)
-             : compress_decode_base64<false>(output, input, length, options,
-                                             last_chunk_options);
-}
+/* begin file src/haswell/avx2_base64.cpp */
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
 
-simdutf_warn_unused full_result implementation::base64_to_binary_details(
-    const char16_t *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_options) const noexcept {
-  return (options & base64_url)
-             ? compress_decode_base64<true>(output, input, length, options,
-                                            last_chunk_options)
-             : compress_decode_base64<false>(output, input, length, options,
-                                             last_chunk_options);
-}
+template <bool base64_url>
+simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
+  // credit: Wojciech Muła
+  __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
+  const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
+  result =
+      _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
+  __m256i shift_LUT;
+  if (base64_url) {
+    shift_LUT = _mm256_setr_epi8(
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
 
-simdutf_warn_unused size_t implementation::base64_length_from_binary(
-    size_t length, base64_options options) const noexcept {
-  return scalar::base64::base64_length_from_binary(length, options);
-}
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
+  } else {
+    shift_LUT = _mm256_setr_epi8(
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
 
-size_t implementation::binary_to_base64(const char *input, size_t length,
-                                        char *output,
-                                        base64_options options) const noexcept {
-  return encode_base64(output, input, length, options);
+        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+  }
+
+  result = _mm256_shuffle_epi8(shift_LUT, result);
+  return _mm256_add_epi8(result, input);
 }
 
-} // namespace arm64
-} // namespace simdutf
+template <bool isbase64url>
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+                     base64_options options) {
+  // credit: Wojciech Muła
+  const uint8_t *input = (const uint8_t *)src;
 
-/* begin file src/simdutf/arm64/end.h */
-/* end file src/simdutf/arm64/end.h */
-/* end file src/arm64/implementation.cpp */
-#endif
-#if SIMDUTF_IMPLEMENTATION_FALLBACK
-/* begin file src/fallback/implementation.cpp */
-/* begin file src/simdutf/fallback/begin.h */
-// redefining SIMDUTF_IMPLEMENTATION to "fallback"
-// #define SIMDUTF_IMPLEMENTATION fallback
-/* end file src/simdutf/fallback/begin.h */
+  uint8_t *out = (uint8_t *)dst;
+  const __m256i shuf =
+      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
 
+                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
+  size_t i = 0;
+  for (; i + 100 <= srclen; i += 96) {
+    const __m128i lo0 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
+    const __m128i hi0 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
+    const __m128i lo1 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
+    const __m128i hi1 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
+    const __m128i lo2 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
+    const __m128i hi2 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
+    const __m128i lo3 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
+    const __m128i hi3 = _mm_loadu_si128(
+        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
 
+    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
+    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
+    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
+    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
 
+    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
+    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
+    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
+    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
 
+    const __m256i t1_0 =
+        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
+    const __m256i t1_1 =
+        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
+    const __m256i t1_2 =
+        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
+    const __m256i t1_3 =
+        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
 
+    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
+    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
+    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
+    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
 
+    const __m256i t3_0 =
+        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
+    const __m256i t3_1 =
+        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
+    const __m256i t3_2 =
+        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
+    const __m256i t3_3 =
+        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
 
+    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
+    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
+    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
+    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
 
-#include <cstdint>
-#include <cstring>
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+                        lookup_pshufb_improved<isbase64url>(input0));
+    out += 32;
 
-namespace simdutf {
-namespace fallback {
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+                        lookup_pshufb_improved<isbase64url>(input1));
+    out += 32;
 
-simdutf_warn_unused int
-implementation::detect_encodings(const char *input,
-                                 size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if (bom_encoding != encoding_type::unspecified) {
-    return bom_encoding;
-  }
-  // todo: reimplement as a one-pass algorithm.
-  int out = 0;
-  if (validate_utf8(input, length)) {
-    out |= encoding_type::UTF8;
-  }
-  if ((length % 2) == 0) {
-    if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
-                         length / 2)) {
-      out |= encoding_type::UTF16_LE;
-    }
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+                        lookup_pshufb_improved<isbase64url>(input2));
+    out += 32;
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+                        lookup_pshufb_improved<isbase64url>(input3));
+    out += 32;
   }
-  if ((length % 4) == 0) {
-    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
-      out |= encoding_type::UTF32_LE;
-    }
+  for (; i + 28 <= srclen; i += 24) {
+    // lo = [xxxx|DDDC|CCBB|BAAA]
+    // hi = [xxxx|HHHG|GGFF|FEEE]
+    const __m128i lo =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
+    const __m128i hi =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
+
+    // bytes from groups A, B and C are needed in separate 32-bit lanes
+    // in = [0HHH|0GGG|0FFF|0EEE|0DDD|0CCC|0BBB|0AAA]
+    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
+
+    // this part is well commented in encode.sse.cpp
+
+    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
+    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
+    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
+    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
+    const __m256i indices = _mm256_or_si256(t1, t3);
+
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
+                        lookup_pshufb_improved<isbase64url>(indices));
+    out += 32;
   }
-  return out;
+  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
+                                                        srclen - i, options);
 }
 
-simdutf_warn_unused bool
-implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  return scalar::utf8::validate(buf, len);
-}
+static inline void compress(__m128i data, uint16_t mask, char *output) {
+  if (mask == 0) {
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
+    return;
+  }
+  // this particular implementation was inspired by work done by @animetosho
+  // we do it in two steps, first 8 bytes and then second 8 bytes
+  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
+  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
+  // next line just loads the 64-bit values thintable_epi8[mask1] and
+  // thintable_epi8[mask2] into a 128-bit register, using only
+  // two instructions on most compilers.
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(
-    const char *buf, size_t len) const noexcept {
-  return scalar::utf8::validate_with_errors(buf, len);
-}
+  __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2],
+                                    tables::base64::thintable_epi8[mask1]);
+  // we increment by 0x08 the second half of the mask
+  shufmask =
+      _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
+  // this is the version "nearly pruned"
+  __m128i pruned = _mm_shuffle_epi8(data, shufmask);
+  // we still need to put the two halves together.
+  // we compute the popcount of the first half:
+  int pop1 = tables::base64::BitsSetTable256mul2[mask1];
+  // then load the corresponding mask, what it does is to write
+  // only the first pop1 bytes from the first 8 bytes, and then
+  // it fills in with the bytes from the second 8 bytes + some filling
+  // at the end.
+  __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
+      tables::base64::pshufb_combine_table + pop1 * 8));
+  __m128i answer = _mm_shuffle_epi8(pruned, compactmask);
 
-simdutf_warn_unused bool
-implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return scalar::ascii::validate(buf, len);
+  _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(
-    const char *buf, size_t len) const noexcept {
-  return scalar::ascii::validate_with_errors(buf, len);
+static inline void compress(__m256i data, uint32_t mask, char *output) {
+  if (mask == 0) {
+    _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data);
+    return;
+  }
+  compress(_mm256_castsi256_si128(data), uint16_t(mask), output);
+  compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16),
+           output + _mm_popcnt_u32(~mask & 0xFFFF));
 }
 
-simdutf_warn_unused bool
-implementation::validate_utf16le(const char16_t *buf,
-                                 size_t len) const noexcept {
-  return scalar::utf16::validate<endianness::LITTLE>(buf, len);
-}
+struct block64 {
+  __m256i chunks[2];
+};
 
-simdutf_warn_unused bool
-implementation::validate_utf16be(const char16_t *buf,
-                                 size_t len) const noexcept {
-  return scalar::utf16::validate<endianness::BIG>(buf, len);
-}
+template <bool base64_url>
+static inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) {
+  const __m256i ascii_space_tbl =
+      _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
+                       0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
+                       0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0);
+  // credit: aqrit
+  __m256i delta_asso;
+  if (base64_url) {
+    delta_asso =
+        _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
+                         0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                         0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
+  } else {
+    delta_asso = _mm256_setr_epi8(
+        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+  }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(
-    const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
-}
+  __m256i delta_values;
+  if (base64_url) {
+    delta_values = _mm256_setr_epi8(
+        0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
+        uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
+        uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
+        uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
+        uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
+  } else {
+    delta_values = _mm256_setr_epi8(
+        int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
+        int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
+        int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+        int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+        int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
+        int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
+        int8_t(0xB9), int8_t(0xB9));
+  }
+  __m256i check_asso;
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(
-    const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
-}
+  if (base64_url) {
+    check_asso =
+        _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
+                         0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1,
+                         0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6);
+  } else {
 
-simdutf_warn_unused bool
-implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
-  return scalar::utf32::validate(buf, len);
+    check_asso = _mm256_setr_epi8(
+        0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
+        0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+        0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+  }
+  __m256i check_values;
+  if (base64_url) {
+    check_values = _mm256_setr_epi8(
+        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6),
+        uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80),
+        0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+        uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6),
+        uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
+        uint8_t(0x80), 0x0, uint8_t(0x80));
+  } else {
+    check_values = _mm256_setr_epi8(
+        int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
+        int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
+        int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
+        int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+        int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
+        int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
+        int8_t(0x91), int8_t(0x80));
+  }
+  const __m256i shifted = _mm256_srli_epi32(*src, 3);
+  const __m256i delta_hash =
+      _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
+  const __m256i check_hash =
+      _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
+  const __m256i out =
+      _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
+  const __m256i chk =
+      _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src);
+  const int mask = _mm256_movemask_epi8(chk);
+  if (mask) {
+    __m256i ascii_space =
+        _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
+    *error = (mask ^ _mm256_movemask_epi8(ascii_space));
+  }
+  *src = out;
+  return (uint32_t)mask;
 }
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(
-    const char32_t *buf, size_t len) const noexcept {
-  return scalar::utf32::validate_with_errors(buf, len);
+template <bool base64_url>
+static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) {
+  uint32_t err0 = 0;
+  uint32_t err1 = 0;
+  uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], &err0);
+  uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], &err1);
+  *error = err0 | ((uint64_t)err1 << 32);
+  return m0 | (m1 << 32);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
-    const char *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::latin1_to_utf8::convert(buf, len, utf8_output);
+static inline void copy_block(block64 *b, char *output) {
+  _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), b->chunks[0]);
+  _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), b->chunks[1]);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::latin1_to_utf16::convert<endianness::LITTLE>(buf, len,
-                                                              utf16_output);
+static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
+  uint64_t nmask = ~mask;
+  compress(b->chunks[0], uint32_t(mask), output);
+  compress(b->chunks[1], uint32_t(mask >> 32),
+           output + _mm_popcnt_u64(nmask & 0xFFFFFFFF));
+  return _mm_popcnt_u64(nmask);
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::latin1_to_utf16::convert<endianness::BIG>(buf, len,
-                                                           utf16_output);
+// The caller of this function is responsible for ensuring that 64 bytes are
+// available for reading at src. The data is read into a block64 structure.
+static inline void load_block(block64 *b, const char *src) {
+  b->chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+  b->chunks[1] =
+      _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
 }
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
-    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::latin1_to_utf32::convert(buf, len, utf32_output);
+// The caller of this function is responsible for ensuring that 128 bytes are
+// available for reading at src. The data is read into a block64 structure.
+static inline void load_block(block64 *b, const char16_t *src) {
+  __m256i m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
+  __m256i m2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16));
+  __m256i m3 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
+  __m256i m4 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48));
+  __m256i m1p = _mm256_permute2x128_si256(m1, m2, 0x20);
+  __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x31);
+  __m256i m3p = _mm256_permute2x128_si256(m3, m4, 0x20);
+  __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x31);
+  b->chunks[0] = _mm256_packus_epi16(m1p, m2p);
+  b->chunks[1] = _mm256_packus_epi16(m3p, m4p);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
-    const char *buf, size_t len, char *latin1_output) const noexcept {
-  return scalar::utf8_to_latin1::convert(buf, len, latin1_output);
-}
+static inline void base64_decode(char *out, __m256i str) {
+  // credit: aqrit
+  const __m256i pack_shuffle =
+      _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
+                       2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
+  const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140));
+  const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000));
+  const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle);
 
-simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
-    const char *buf, size_t len, char *latin1_output) const noexcept {
-  return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output);
+  // Store the output:
+  _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2));
+  _mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1));
 }
-
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
-    const char *buf, size_t len, char *latin1_output) const noexcept {
-  return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+// decode 64 bytes and output 48 bytes
+static inline void base64_decode_block(char *out, const char *src) {
+  base64_decode(out,
+                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
+  base64_decode(out + 24, _mm256_loadu_si256(
+                              reinterpret_cast<const __m256i *>(src + 32)));
 }
-
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf8_to_utf16::convert<endianness::LITTLE>(buf, len,
-                                                            utf16_output);
+static inline void base64_decode_block_safe(char *out, const char *src) {
+  base64_decode(out,
+                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
+  char buffer[32]; // We enforce safety with a buffer.
+  base64_decode(
+      buffer, _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32)));
+  std::memcpy(out + 24, buffer, 24);
 }
-
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf8_to_utf16::convert<endianness::BIG>(buf, len,
-                                                         utf16_output);
+static inline void base64_decode_block(char *out, block64 *b) {
+  base64_decode(out, b->chunks[0]);
+  base64_decode(out + 24, b->chunks[1]);
 }
-
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(
-      buf, len, utf16_output);
+static inline void base64_decode_block_safe(char *out, block64 *b) {
+  base64_decode(out, b->chunks[0]);
+  char buffer[32]; // We enforce safety with a buffer.
+  base64_decode(buffer, b->chunks[1]);
+  std::memcpy(out + 24, buffer, 24);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(
-      buf, len, utf16_output);
-}
+template <bool base64_url, typename chartype>
+full_result
+compress_decode_base64(char *dst, const chartype *src, size_t srclen,
+                       base64_options options,
+                       last_chunk_handling_options last_chunk_options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value;
+  size_t equallocation =
+      srclen; // location of the first padding character if any
+  // skip trailing spaces
+  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+         to_base64[uint8_t(src[srclen - 1])] == 64) {
+    srclen--;
+  }
+  size_t equalsigns = 0;
+  if (srclen > 0 && src[srclen - 1] == '=') {
+    equallocation = srclen - 1;
+    srclen--;
+    equalsigns = 1;
+    // skip trailing spaces
+    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+           to_base64[uint8_t(src[srclen - 1])] == 64) {
+      srclen--;
+    }
+    if (srclen > 0 && src[srclen - 1] == '=') {
+      equallocation = srclen - 1;
+      srclen--;
+      equalsigns = 2;
+    }
+  }
+  if (srclen == 0) {
+    if (equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation, 0};
+    }
+    return {SUCCESS, 0, 0};
+  }
+  char *end_of_safe_64byte_zone =
+      (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst;
+
+  const chartype *const srcinit = src;
+  const char *const dstinit = dst;
+  const chartype *const srcend = src + srclen;
+
+  constexpr size_t block_size = 6;
+  static_assert(block_size >= 2, "block_size must be at least two");
+  char buffer[block_size * 64];
+  char *bufferptr = buffer;
+  if (srclen >= 64) {
+    const chartype *const srcend64 = src + srclen - 64;
+    while (src <= srcend64) {
+      block64 b;
+      load_block(&b, src);
+      src += 64;
+      uint64_t error = 0;
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
+      if (error) {
+        src -= 64;
+        size_t error_offset = _tzcnt_u64(error);
+        return {error_code::INVALID_BASE64_CHARACTER,
+                size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
+      }
+      if (badcharmask != 0) {
+        // optimization opportunity: check for simple masks like those made of
+        // continuous 1s followed by continuous 0s. And masks containing a
+        // single bad character.
+        bufferptr += compress_block(&b, badcharmask, bufferptr);
+      } else if (bufferptr != buffer) {
+        copy_block(&b, bufferptr);
+        bufferptr += 64;
+      } else {
+        if (dst >= end_of_safe_64byte_zone) {
+          base64_decode_block_safe(dst, &b);
+        } else {
+          base64_decode_block(dst, &b);
+        }
+        dst += 48;
+      }
+      if (bufferptr >= (block_size - 1) * 64 + buffer) {
+        for (size_t i = 0; i < (block_size - 2); i++) {
+          base64_decode_block(dst, buffer + i * 64);
+          dst += 48;
+        }
+        if (dst >= end_of_safe_64byte_zone) {
+          base64_decode_block_safe(dst, buffer + (block_size - 2) * 64);
+        } else {
+          base64_decode_block(dst, buffer + (block_size - 2) * 64);
+        }
+        dst += 48;
+        std::memcpy(buffer, buffer + (block_size - 1) * 64,
+                    64); // 64 might be too much
+        bufferptr -= (block_size - 1) * 64;
+      }
+    }
+  }
+
+  char *buffer_start = buffer;
+  // Optimization note: if this is almost full, then it is worth our
+  // time, otherwise, we should just decode directly.
+  int last_block = (int)((bufferptr - buffer_start) % 64);
+  if (last_block != 0 && srcend - src + last_block >= 64) {
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(buf, len,
-                                                                  utf16_output);
-}
+    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
+      uint8_t val = to_base64[uint8_t(*src)];
+      *bufferptr = char(val);
+      if (!scalar::base64::is_eight_byte(*src) || val > 64) {
+        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+                size_t(dst - dstinit)};
+      }
+      bufferptr += (val <= 63);
+      src++;
+    }
+  }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf8_to_utf16::convert_valid<endianness::BIG>(buf, len,
-                                                               utf16_output);
-}
+  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
+    if (dst >= end_of_safe_64byte_zone) {
+      base64_decode_block_safe(dst, buffer_start);
+    } else {
+      base64_decode_block(dst, buffer_start);
+    }
+    dst += 48;
+  }
+  if ((bufferptr - buffer_start) % 64 != 0) {
+    while (buffer_start + 4 < bufferptr) {
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 4);
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
-    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf8_to_utf32::convert(buf, len, utf32_output);
-}
+      dst += 3;
+      buffer_start += 4;
+    }
+    if (buffer_start + 4 <= bufferptr) {
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 3);
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
-    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output);
+      dst += 3;
+      buffer_start += 4;
+    }
+    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
+    // backtrack
+    int leftover = int(bufferptr - buffer_start);
+    while (leftover > 0) {
+      while (to_base64[uint8_t(*(src - 1))] == 64) {
+        src--;
+      }
+      src--;
+      leftover--;
+    }
+  }
+  if (src < srcend + equalsigns) {
+    full_result r = scalar::base64::base64_tail_decode(
+        dst, src, srcend - src, equalsigns, options, last_chunk_options);
+    r.input_count += size_t(src - srcinit);
+    if (r.error == error_code::INVALID_BASE64_CHARACTER ||
+        r.error == error_code::BASE64_EXTRA_BITS) {
+      return r;
+    } else {
+      r.output_count += size_t(dst - dstinit);
+    }
+    if (last_chunk_options != stop_before_partial &&
+        r.error == error_code::SUCCESS && equalsigns > 0) {
+      // additional checks
+      if ((r.output_count % 3 == 0) ||
+          ((r.output_count % 3) + 1 + equalsigns != 4)) {
+        r.error = error_code::INVALID_BASE64_CHARACTER;
+        r.input_count = equallocation;
+      }
+    }
+    return r;
+  }
+  if (equalsigns > 0) {
+    if ((size_t(dst - dstinit) % 3 == 0) ||
+        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
+      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
+    }
+  }
+  return {SUCCESS, srclen, size_t(dst - dstinit)};
 }
+/* end file src/haswell/avx2_base64.cpp */
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
-    const char *input, size_t size, char32_t *utf32_output) const noexcept {
-  return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output);
-}
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  return scalar::utf16_to_latin1::convert<endianness::LITTLE>(buf, len,
-                                                              latin1_output);
-}
+/* begin file src/generic/buf_block_reader.h */
+namespace simdutf {
+namespace haswell {
+namespace {
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  return scalar::utf16_to_latin1::convert<endianness::BIG>(buf, len,
-                                                           latin1_output);
-}
+// Walks through a buffer in block-sized increments, loading the last part with
+// spaces
+template <size_t STEP_SIZE> struct buf_block_reader {
+public:
+  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
+  simdutf_really_inline size_t block_index();
+  simdutf_really_inline bool has_full_block() const;
+  simdutf_really_inline const uint8_t *full_block() const;
+  /**
+   * Get the last block, padded with spaces.
+   *
+   * There will always be a last block, with at least 1 byte, unless len == 0
+   * (in which case this function returns 0 and leaves dst untouched). In
+   * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder
+   * block with STEP_SIZE bytes and no spaces for padding.
+   *
+   * @return the number of effective characters in the last block.
+   */
+  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
+  simdutf_really_inline void advance();
 
-simdutf_warn_unused result
-implementation::convert_utf16le_to_latin1_with_errors(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  return scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
-      buf, len, latin1_output);
-}
+private:
+  const uint8_t *buf;
+  const size_t len;
+  const size_t lenminusstep;
+  size_t idx;
+};
 
-simdutf_warn_unused result
-implementation::convert_utf16be_to_latin1_with_errors(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  return scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
-      buf, len, latin1_output);
+// Routines to print masks and text for debugging bitmask operations
+simdutf_unused static char *format_input_text_64(const uint8_t *text) {
+  static char *buf =
+      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  return scalar::utf16_to_latin1::convert_valid<endianness::LITTLE>(
-      buf, len, latin1_output);
+// Routines to print masks and text for debugging bitmask operations
+simdutf_unused static char *format_input_text(const simd8x64<uint8_t> &in) {
+  static char *buf =
+      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+  in.store(reinterpret_cast<uint8_t *>(buf));
+  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+    if (buf[i] < ' ') {
+      buf[i] = '_';
+    }
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  return scalar::utf16_to_latin1::convert_valid<endianness::BIG>(buf, len,
-                                                                 latin1_output);
+simdutf_unused static char *format_mask(uint64_t mask) {
+  static char *buf = reinterpret_cast<char *>(malloc(64 + 1));
+  for (size_t i = 0; i < 64; i++) {
+    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
+  }
+  buf[64] = '\0';
+  return buf;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len,
-                                                            utf8_output);
-}
+template <size_t STEP_SIZE>
+simdutf_really_inline
+buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
+    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
+      idx{0} {}
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
+  return idx;
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
-      buf, len, utf8_output);
+template <size_t STEP_SIZE>
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
+  return idx < lenminusstep;
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
-      buf, len, utf8_output);
+template <size_t STEP_SIZE>
+simdutf_really_inline const uint8_t *
+buf_block_reader<STEP_SIZE>::full_block() const {
+  return &buf[idx];
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len,
-                                                                  utf8_output);
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t
+buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
+  if (len == idx) {
+    return 0;
+  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
+  std::memset(dst, 0x20,
+              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
+                          // to write out 8 or 16 bytes at once.
+  std::memcpy(dst, buf + idx, len - idx);
+  return len - idx;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len,
-                                                               utf8_output);
+template <size_t STEP_SIZE>
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
+  idx += STEP_SIZE;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
-    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
-  return scalar::utf32_to_latin1::convert(buf, len, latin1_output);
-}
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
+/* end file src/generic/buf_block_reader.h */
+/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
+namespace simdutf {
+namespace haswell {
+namespace {
+namespace utf8_validation {
 
-simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
-    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
-  return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output);
-}
+using namespace simd;
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
-    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
-  return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output);
-}
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation)
+  // Bit 2 = Overlong 3-byte; Bit 3 = Too Large
+  // Bit 4 = Surrogate
+  // Bit 5 = Overlong 2-byte; Bit 6 = Too Large (1000____) / Overlong 4-byte
+  // Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
-}
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
-}
+              // ____0100 ________
+              CARRY | TOO_LARGE,
+              // ____0101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____011_ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
-}
+              // ____1___ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____1101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len,
-                                                             utf16_output);
-}
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len,
-                                                          utf16_output);
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high);
 }
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-      buf, len, utf16_output);
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) {
+  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
+  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
+  simd8<uint8_t> must23 =
+      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
+  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
+  return must23_80 ^ sc;
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-      buf, len, utf16_output);
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the
+// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
+//
+simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
+  // If the previous input's last 3 bytes match this, they're too short (they
+  // ended at EOF):
+  // ... 1111____ 111_____ 11______
+  static const uint8_t max_array[32] = {255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        0b11110000u - 1,
+                                        0b11100000u - 1,
+                                        0b11000000u - 1};
+  const simd8<uint8_t> max_value(
+      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
+  return input.gt_bits(max_value);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(
-      buf, len, utf16_output);
-}
+struct utf8_checker {
+  // If this is nonzero, there has been a UTF-8 error.
+  simd8<uint8_t> error;
+  // The last input we received
+  simd8<uint8_t> prev_input_block;
+  // Whether the last input we received was incomplete (used for ASCII fast
+  // path)
+  simd8<uint8_t> prev_incomplete;
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len,
-                                                                utf16_output);
-}
+  //
+  // Check whether the current bytes are valid UTF-8.
+  //
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // Build prev1 from input and prev_input (presumably each byte's
+    // predecessor, carried across blocks), run the special-case lookups on
+    // the (prev1, input) pair, and OR the multibyte-length check into error.
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    simd8<uint8_t> sc = check_special_cases(input, prev1);
+    this->error |= check_multibyte_lengths(input, prev_input, sc);
+  }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len,
-                                                             utf32_output);
-}
+  // The only problem that can happen at EOF is that a multibyte character is
+  // too short or a byte value too large in the last bytes: check_special_cases
+  // only checks for bytes too large in the first of two bytes.
+  simdutf_really_inline void check_eof() {
+    // If the last non-ASCII block ended with an incomplete UTF-8 character,
+    // no further input follows to complete it, so report it as an error.
+    this->error |= this->prev_incomplete;
+  }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len,
-                                                          utf32_output);
-}
+  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
+    if (simdutf_likely(is_ascii(input))) {
+      this->error |= this->prev_incomplete;
+    } else {
+      // you might think that a for-loop would work, but under Visual Studio, it
+      // is not good enough.
+      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+      }
+      this->prev_incomplete =
+          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
+      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
+    }
+  }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-      buf, len, utf32_output);
-}
+  // do not forget to call check_eof!
+  simdutf_really_inline bool errors() const {
+    return this->error.any_bits_set_anywhere();
+  }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-      buf, len, utf32_output);
-}
+}; // struct utf8_checker
+} // namespace utf8_validation
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(
-      buf, len, utf32_output);
-}
+using utf8_validation::utf8_checker;
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len,
-                                                                utf32_output);
-}
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
+/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
+/* begin file src/generic/utf8_validation/utf8_validator.h */
+namespace simdutf {
+namespace haswell {
+namespace {
+namespace utf8_validation {
 
-void implementation::change_endianness_utf16(const char16_t *input,
-                                             size_t length,
-                                             char16_t *output) const noexcept {
-  scalar::utf16::change_endianness_utf16(input, length, output);
+/**
+ * Validates that the string is actual UTF-8; returns true when `checker` reports no errors.
+ */
+template <class checker>
+bool generic_validate_utf8(const uint8_t *input, size_t length) {
+  checker c{};
+  buf_block_reader<64> reader(input, length);
+  while (reader.has_full_block()) { // feed every complete 64-byte block
+    simd::simd8x64<uint8_t> in(reader.full_block());
+    c.check_next_input(in);
+    reader.advance();
+  }
+  uint8_t block[64]{}; // zero-initialized scratch block handed to get_remainder
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  c.check_next_input(in);
+  reader.advance();
+  c.check_eof(); // must run before errors() is consulted
+  return !c.errors();
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
+bool generic_validate_utf8(const char *input, size_t length) { // char* overload, checker fixed to utf8_checker
+  return generic_validate_utf8<utf8_checker>(
+      reinterpret_cast<const uint8_t *>(input), length);
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::utf16::count_code_points<endianness::BIG>(input, length);
+/**
+ * Validates that the string is actual UTF-8 and stops on errors.
+ * Returns (SUCCESS, length) when the checker finds nothing; otherwise returns the scalar validator's result.
+ */
+template <class checker>
+result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
+  checker c{};
+  buf_block_reader<64> reader(input, length);
+  size_t count{0}; // byte offset of the block currently being checked
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
+    c.check_next_input(in);
+    if (c.errors()) {
+      if (count != 0) {
+        count--;
+      } // Step back one byte: sometimes the error is only detected in the next chunk
+      result res = scalar::utf8::rewind_and_validate_with_errors(
+          reinterpret_cast<const char *>(input),
+          reinterpret_cast<const char *>(input + count), length - count);
+      res.count += count; // presumably rebases res.count from input + count onto input
+      return res;
+    }
+    reader.advance();
+    count += 64;
+  }
+  uint8_t block[64]{}; // zero-initialized scratch block handed to get_remainder
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  c.check_next_input(in);
+  reader.advance();
+  c.check_eof();
+  if (c.errors()) {
+    if (count != 0) {
+      count--;
+    } // Step back one byte: sometimes the error is only detected in the next chunk
+    result res = scalar::utf8::rewind_and_validate_with_errors(
+        reinterpret_cast<const char *>(input),
+        reinterpret_cast<const char *>(input) + count, length - count);
+    res.count += count; // presumably rebases res.count from input + count onto input
+    return res;
+  } else {
+    return result(error_code::SUCCESS, length);
+  }
 }
 
-simdutf_warn_unused size_t
-implementation::count_utf8(const char *input, size_t length) const noexcept {
-  return scalar::utf8::count_code_points(input, length);
+result generic_validate_utf8_with_errors(const char *input, size_t length) { // char* overload, checker fixed to utf8_checker
+  return generic_validate_utf8_with_errors<utf8_checker>(
+      reinterpret_cast<const uint8_t *>(input), length);
 }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
-    const char *buf, size_t len) const noexcept {
-  return scalar::utf8::count_code_points(buf, len);
+template <class checker> // checker is not referenced in the body
+bool generic_validate_ascii(const uint8_t *input, size_t length) {
+  buf_block_reader<64> reader(input, length);
+  uint8_t blocks[64]{}; // all-zero seed for the running OR
+  simd::simd8x64<uint8_t> running_or(blocks);
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
+    running_or |= in; // accumulate every block so one is_ascii() test at the end suffices
+    reader.advance();
+  }
+  uint8_t block[64]{}; // zero-initialized scratch block handed to get_remainder
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  running_or |= in;
+  return running_or.is_ascii();
 }
 
-simdutf_warn_unused size_t
-implementation::latin1_length_from_utf16(size_t length) const noexcept {
-  return scalar::utf16::latin1_length_from_utf16(length);
+bool generic_validate_ascii(const char *input, size_t length) { // char* overload instantiated with utf8_checker
+  return generic_validate_ascii<utf8_checker>(
+      reinterpret_cast<const uint8_t *>(input), length);
 }
 
-simdutf_warn_unused size_t
-implementation::latin1_length_from_utf32(size_t length) const noexcept {
-  return length;
-}
+template <class checker> // checker is not referenced in the body
+result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) {
+  buf_block_reader<64> reader(input, length);
+  size_t count{0}; // byte offset of the block currently being checked
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
+    if (!in.is_ascii()) {
+      result res = scalar::ascii::validate_with_errors(
+          reinterpret_cast<const char *>(input + count), length - count);
+      return result(res.error, count + res.count); // add count back since the scalar pass started at input + count
+    }
+    reader.advance();
 
-simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
-    const char *input, size_t length) const noexcept {
-  size_t answer = length;
-  size_t i = 0;
-  auto pop = [](uint64_t v) {
-    return (size_t)(((v >> 7) & UINT64_C(0x0101010101010101)) *
-                        UINT64_C(0x0101010101010101) >>
-                    56);
-  };
-  for (; i + 32 <= length; i += 32) {
-    uint64_t v;
-    memcpy(&v, input + i, 8);
-    answer += pop(v);
-    memcpy(&v, input + i + 8, sizeof(v));
-    answer += pop(v);
-    memcpy(&v, input + i + 16, sizeof(v));
-    answer += pop(v);
-    memcpy(&v, input + i + 24, sizeof(v));
-    answer += pop(v);
-  }
-  for (; i + 8 <= length; i += 8) {
-    uint64_t v;
-    memcpy(&v, input + i, sizeof(v));
-    answer += pop(v);
+    count += 64;
   }
-  for (; i + 1 <= length; i += 1) {
-    answer += static_cast<uint8_t>(input[i]) >> 7;
+  uint8_t block[64]{}; // zero-initialized scratch block handed to get_remainder
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  if (!in.is_ascii()) {
+    result res = scalar::ascii::validate_with_errors(
+        reinterpret_cast<const char *>(input + count), length - count);
+    return result(res.error, count + res.count); // add count back since the scalar pass started at input + count
+  } else {
+    return result(error_code::SUCCESS, length);
   }
-  return answer;
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input,
-                                                                   length);
+result generic_validate_ascii_with_errors(const char *input, size_t length) { // char* overload instantiated with utf8_checker
+  return generic_validate_ascii_with_errors<utf8_checker>(
+      reinterpret_cast<const uint8_t *>(input), length);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
-}
+} // namespace utf8_validation
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
+/* end file src/generic/utf8_validation/utf8_validator.h */
+// transcoding from UTF-8 to UTF-16
+/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input,
-                                                                    length);
-}
+namespace simdutf {
+namespace haswell {
+namespace {
+namespace utf8_to_utf16 {
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
-}
+using namespace simd;
 
-simdutf_warn_unused size_t
-implementation::utf16_length_from_latin1(size_t length) const noexcept {
-  return scalar::latin1::utf16_length_from_latin1(length);
+template <endianness endian> // returns the number of char16_t units written to utf16_output
+simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
+                                         char16_t *utf16_output) noexcept {
+  // The implementation is not specific to haswell and should be moved to the
+  // generic directory. NOTE(review): the begin-file marker already says src/generic/ - remark may be stale.
+  size_t pos = 0;
+  char16_t *start{utf16_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while (pos + 64 + safety_margin <= size) {
+    // this loop could be unrolled further. For example, we could process the
+    // mask far more than 64 bytes.
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if (in.is_ascii()) {
+      in.store_ascii_as_utf16<endian>(utf16_output);
+      utf16_output += 64;
+      pos += 64;
+    } else {
+      // Slow path. We hope that the compiler will recognize that this is a slow
+      // path. Anything that is not a continuation mask is a 'leading byte',
+      // that is, the start of a new code point.
+      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+      // -65 is 0b10111111 in two's complement, so it is the largest possible
+      // continuation byte
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      // The *start* of code points is not so useful, rather, we want the *end*
+      // of code points.
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+      // We process in blocks of up to 12 bytes except possibly
+      // for fast paths which may process up to 16 bytes. For the
+      // slow path to work, we should have at least 12 input bytes left.
+      size_t max_starting_point = (pos + 64) - 12;
+      // Next loop is going to run at least five times when using solely
+      // the slow/regular path, and at least four times if there are fast paths.
+      while (pos < max_starting_point) {
+        // Performance note: our ability to compute 'consumed' and
+        // then shift and recompute is critical. If there is a
+        // latency of, say, 4 cycles on getting 'consumed', then
+        // the inner loop might have a total latency of about 6 cycles.
+        // Yet we process between 6 to 12 inputs bytes, thus we get
+        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+        // for this section of the code. Hence, there is a limit
+        // to how much we can further increase this latency before
+        // it seriously harms performance.
+        //
+        // Thus we may allow convert_masked_utf8_to_utf16 to process
+        // more bytes at a time under a fast-path mode where 16 bytes
+        // are consumed at once (e.g., when encountering ASCII).
+        size_t consumed = convert_masked_utf8_to_utf16<endian>(
+            input + pos, utf8_end_of_code_point_mask, utf16_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
+      }
+      // At this point there may remain between 0 and 12 bytes in the
+      // 64-byte block. These bytes will be processed again. So we have an
+      // 80% efficiency (in the worst case). In practice we expect an
+      // 85% to 90% efficiency.
+    }
+  }
+  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
+      input + pos, size - pos, utf16_output); // scalar code finishes the tail left by the loop
+  return utf16_output - start;
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
-    const char *input, size_t length) const noexcept {
-  return scalar::utf8::utf16_length_from_utf8(input, length);
-}
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
+/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
-    const char32_t *input, size_t length) const noexcept {
-  return scalar::utf32::utf8_length_from_utf32(input, length);
-}
+namespace simdutf {
+namespace haswell {
+namespace {
+namespace utf8_to_utf16 {
+using namespace simd;
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
-    const char32_t *input, size_t length) const noexcept {
-  return scalar::utf32::utf16_length_from_utf32(input, length);
-}
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation)
+  // Bit 2 = Overlong 3-byte, Bit 3 = Too Large
+  // Bit 4 = Surrogate
+  // Bit 5 = Overlong 2-byte, Bit 6 = Too Large (1000____) / Overlong 4-byte
+  // Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ (shares bit 6 with TOO_LARGE_1000)
 
-simdutf_warn_unused size_t
-implementation::utf32_length_from_latin1(size_t length) const noexcept {
-  return scalar::latin1::utf32_length_from_latin1(length);
-}
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
-    const char *input, size_t length) const noexcept {
-  return scalar::utf8::count_code_points(input, length);
-}
+              // ____0100 ________
+              CARRY | TOO_LARGE,
+              // ____0101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____011_ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
 
-simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
-    const char *input, size_t length) const noexcept {
-  return scalar::base64::maximal_binary_length_from_base64(input, length);
-}
+              // ____1___ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____1101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
 
-simdutf_warn_unused result implementation::base64_to_binary(
-    const char *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_options) const noexcept {
-  while (length > 0 &&
-         scalar::base64::is_ascii_white_space(input[length - 1])) {
-    length--;
-  }
-  size_t equallocation =
-      length; // location of the first padding character if any
-  size_t equalsigns = 0;
-  if (length > 0 && input[length - 1] == '=') {
-    equallocation = length - 1;
-    length -= 1;
-    equalsigns++;
-    while (length > 0 &&
-           scalar::base64::is_ascii_white_space(input[length - 1])) {
-      length--;
-    }
-    if (length > 0 && input[length - 1] == '=') {
-      equallocation = length - 1;
-      equalsigns++;
-      length -= 1;
-    }
-  }
-  if (length == 0) {
-    if (equalsigns > 0) {
-      return {INVALID_BASE64_CHARACTER, equallocation};
-    }
-    return {SUCCESS, 0};
-  }
-  result r = scalar::base64::base64_tail_decode(
-      output, input, length, equalsigns, options, last_chunk_options);
-  if (last_chunk_options != stop_before_partial &&
-      r.error == error_code::SUCCESS && equalsigns > 0) {
-    // additional checks
-    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, equallocation};
-    }
-  }
-  return r;
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high); // a bit survives only when all three lookups set it
+}
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) {
+  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
+  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
+  simd8<uint8_t> must23 =
+      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
+  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80); // keep only bit 7 (0x80)
+  return must23_80 ^ sc; // XOR: bit 7 of the result is set only where must23_80 and sc disagree on it
 }
 
-simdutf_warn_unused full_result implementation::base64_to_binary_details(
-    const char *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_options) const noexcept {
-  while (length > 0 &&
-         scalar::base64::is_ascii_white_space(input[length - 1])) {
-    length--;
-  }
-  size_t equallocation =
-      length; // location of the first padding character if any
-  size_t equalsigns = 0;
-  if (length > 0 && input[length - 1] == '=') {
-    equallocation = length - 1;
-    length -= 1;
-    equalsigns++;
-    while (length > 0 &&
-           scalar::base64::is_ascii_white_space(input[length - 1])) {
-      length--;
-    }
-    if (length > 0 && input[length - 1] == '=') {
-      equallocation = length - 1;
-      equalsigns++;
-      length -= 1;
-    }
-  }
-  if (length == 0) {
-    if (equalsigns > 0) {
-      return {INVALID_BASE64_CHARACTER, equallocation, 0};
-    }
-    return {SUCCESS, 0, 0};
-  }
-  full_result r = scalar::base64::base64_tail_decode(
-      output, input, length, equalsigns, options, last_chunk_options);
-  if (last_chunk_options != stop_before_partial &&
-      r.error == error_code::SUCCESS && equalsigns > 0) {
-    // additional checks
-    if ((r.output_count % 3 == 0) ||
-        ((r.output_count % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
-    }
+struct validating_transcoder {
+  // If this is nonzero, there has been a UTF-8 error.
+  simd8<uint8_t> error;
+
+  validating_transcoder() : error(uint8_t(0)) {} // start with error initialised from 0 (nonzero means a UTF-8 error)
+  //
+  // Check whether the current bytes are valid UTF-8; findings are OR-ed into this->error.
+  //
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
+    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
+    // small negative numbers) -- NOTE(review): no flip is visible in this body; comment may be stale
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    simd8<uint8_t> sc = check_special_cases(input, prev1);
+    this->error |= check_multibyte_lengths(input, prev_input, sc);
   }
-  return r;
-}
-
-simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::base64::maximal_binary_length_from_base64(input, length);
-}
 
-simdutf_warn_unused result implementation::base64_to_binary(
-    const char16_t *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_options) const noexcept {
-  while (length > 0 &&
-         scalar::base64::is_ascii_white_space(input[length - 1])) {
-    length--;
-  }
-  size_t equallocation =
-      length; // location of the first padding character if any
-  size_t equalsigns = 0;
-  if (length > 0 && input[length - 1] == '=') {
-    equallocation = length - 1;
-    length -= 1;
-    equalsigns++;
-    while (length > 0 &&
-           scalar::base64::is_ascii_white_space(input[length - 1])) {
-      length--;
+  template <endianness endian> // returns the number of char16_t units written, or 0 on error
+  simdutf_really_inline size_t convert(const char *in, size_t size,
+                                       char16_t *utf16_output) {
+    size_t pos = 0;
+    char16_t *start{utf16_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65); // > -65 (0b10111111) means not a continuation byte
     }
-    if (length > 0 && input[length - 1] == '=') {
-      equallocation = length - 1;
-      equalsigns++;
-      length -= 1;
+    // If the input is long enough, then margin-1 is the index of the
+    // eighth-to-last leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf16<endian>(utf16_output);
+        utf16_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (utf8_continuation_mask & 1) { // the block starts on a continuation byte
+          return 0; // error
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 inputs bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf16<endian>(
+              in + pos, utf8_end_of_code_point_mask, utf16_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
     }
-  }
-  if (length == 0) {
-    if (equalsigns > 0) {
-      return {INVALID_BASE64_CHARACTER, equallocation};
+    if (errors()) { // error bits accumulated by check_utf8_bytes in the loop above
+      return 0;
     }
-    return {SUCCESS, 0};
-  }
-  result r = scalar::base64::base64_tail_decode(
-      output, input, length, equalsigns, options, last_chunk_options);
-  if (last_chunk_options != stop_before_partial &&
-      r.error == error_code::SUCCESS && equalsigns > 0) {
-    // additional checks
-    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, equallocation};
+    if (pos < size) { // the scalar converter handles whatever the loop left over
+      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
+          in + pos, size - pos, utf16_output);
+      if (howmany == 0) { // 0 from the scalar converter is treated as failure
+        return 0;
+      }
+      utf16_output += howmany;
     }
+    return utf16_output - start;
   }
-  return r;
-}
 
-simdutf_warn_unused full_result implementation::base64_to_binary_details(
-    const char16_t *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_options) const noexcept {
-  while (length > 0 &&
-         scalar::base64::is_ascii_white_space(input[length - 1])) {
-    length--;
-  }
-  size_t equallocation =
-      length; // location of the first padding character if any
-  size_t equalsigns = 0;
-  if (length > 0 && input[length - 1] == '=') {
-    equallocation = length - 1;
-    length -= 1;
-    equalsigns++;
-    while (length > 0 &&
-           scalar::base64::is_ascii_white_space(input[length - 1])) {
-      length--;
+  template <endianness endian> // on success count is the units written; on error it is the error position
+  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+                                                   char16_t *utf16_output) {
+    size_t pos = 0;
+    char16_t *start{utf16_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65); // > -65 (0b10111111) means not a continuation byte
     }
-    if (length > 0 && input[length - 1] == '=') {
-      equallocation = length - 1;
-      equalsigns++;
-      length -= 1;
+    // If the input is long enough, then margin-1 is the index of the
+    // eighth-to-last leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf16<endian>(utf16_output);
+        utf16_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (errors() || (utf8_continuation_mask & 1)) {
+          // rewind_and_convert_with_errors will seek a potential error from
+          // in+pos onward, with the ability to go back up to pos bytes, and
+          // read size-pos bytes forward.
+          result res =
+              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+                  pos, in + pos, size - pos, utf16_output);
+          res.count += pos;
+          return res;
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 inputs bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf16<endian>(
+              in + pos, utf8_end_of_code_point_mask, utf16_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
     }
-  }
-  if (length == 0) {
-    if (equalsigns > 0) {
-      return {INVALID_BASE64_CHARACTER, equallocation, 0};
+    if (errors()) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res =
+          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+              pos, in + pos, size - pos, utf16_output);
+      res.count += pos;
+      return res;
     }
-    return {SUCCESS, 0, 0};
-  }
-  full_result r = scalar::base64::base64_tail_decode(
-      output, input, length, equalsigns, options, last_chunk_options);
-  if (last_chunk_options != stop_before_partial &&
-      r.error == error_code::SUCCESS && equalsigns > 0) {
-    // additional checks
-    if ((r.output_count % 3 == 0) ||
-        ((r.output_count % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
+    if (pos < size) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res =
+          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+              pos, in + pos, size - pos, utf16_output);
+      if (res.error) { // In case of error, we want the error position
+        res.count += pos;
+        return res;
+      } else { // In case of success, we want the number of words written
+        utf16_output += res.count;
+      }
     }
+    return result(error_code::SUCCESS, utf16_output - start);
   }
-  return r;
-}
 
-simdutf_warn_unused size_t implementation::base64_length_from_binary(
-    size_t length, base64_options options) const noexcept {
-  return scalar::base64::base64_length_from_binary(length, options);
-}
+  simdutf_really_inline bool errors() const { // any bit set in this->error?
+    return this->error.any_bits_set_anywhere();
+  }
 
-size_t implementation::binary_to_base64(const char *input, size_t length,
-                                        char *output,
-                                        base64_options options) const noexcept {
-  return scalar::base64::tail_encode_base64(output, input, length, options);
-}
-} // namespace fallback
+}; // struct utf8_checker
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace haswell
 } // namespace simdutf
+/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
+// transcoding from UTF-8 to UTF-32
+/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 
-/* begin file src/simdutf/fallback/end.h */
-/* end file src/simdutf/fallback/end.h */
-/* end file src/fallback/implementation.cpp */
-#endif
-#if SIMDUTF_IMPLEMENTATION_ICELAKE
-/* begin file src/icelake/implementation.cpp */
+namespace simdutf {
+namespace haswell {
+namespace {
+namespace utf8_to_utf32 {
 
+using namespace simd;
 
-/* begin file src/simdutf/icelake/begin.h */
-// redefining SIMDUTF_IMPLEMENTATION to "icelake"
-// #define SIMDUTF_IMPLEMENTATION icelake
+simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
+                                         char32_t *utf32_output) noexcept {
+  size_t pos = 0;
+  char32_t *start{utf32_output}; // kept to compute the output length
+  const size_t safety_margin = 16; // to avoid overruns!
+  while (pos + 64 + safety_margin <= size) {
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if (in.is_ascii()) {
+      in.store_ascii_as_utf32(utf32_output);
+      utf32_output += 64;
+      pos += 64;
+    } else {
+      // -65 is 0b10111111 in two's complement, i.e. the largest possible
+      // continuation byte, so lt(-65 + 1) selects the continuation bytes.
+      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+      size_t max_starting_point = (pos + 64) - 12; // 12 bytes before block end
+      while (pos < max_starting_point) {
+        size_t consumed = convert_masked_utf8_to_utf32(
+            input + pos, utf8_end_of_code_point_mask, utf32_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
+      }
+    }
+  }
+  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
+                                                       utf32_output);
+  return utf32_output - start; // number of char32_t units written
+}
 
-#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
-// nothing needed.
-#else
-SIMDUTF_TARGET_ICELAKE
-#endif
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
+/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
-#if SIMDUTF_GCC11ORMORE // workaround for
-                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-// clang-format off
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
-// clang-format on
-#endif // end of workaround
-/* end file src/simdutf/icelake/begin.h */
 namespace simdutf {
-namespace icelake {
+namespace haswell {
 namespace {
-#ifndef SIMDUTF_ICELAKE_H
-  #error "icelake.h must be included"
-#endif
-/* begin file src/icelake/icelake_utf8_common.inl.cpp */
-// Common procedures for both validating and non-validating conversions from
-// UTF-8.
-enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL };
+namespace utf8_to_utf32 {
+using namespace simd;
 
-using utf8_to_utf16_result = std::pair<const char *, char16_t *>;
-using utf8_to_utf32_result = std::pair<const char *, uint32_t *>;
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation)
+  // Bit 2 = Overlong 3-byte; Bit 3 = Too Large
+  // Bit 4 = Surrogate
+  // Bit 5 = Overlong 2-byte; Bit 6 = Too Large (1000____) / Overlong 4-byte
+  // Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; // same bit as OVERLONG_4
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
-/*
-    process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8
-    to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes)
-    might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which
-    indicates how many input bytes are relevant.
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
 
-    Returns true when the result is correct, otherwise it returns false.
+              // ____0100 ________
+              CARRY | TOO_LARGE,
+              // ____0101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____011_ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
 
-    The provided in and out pointers are advanced according to how many input
-    bytes have been processed, upon success.
-*/
-template <block_processing_mode tail, endianness big_endian>
-simdutf_really_inline bool
-process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) {
-  // constants
-  __m512i mask_identity = _mm512_set_epi8(
-      63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46,
-      45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28,
-      27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9,
-      8, 7, 6, 5, 4, 3, 2, 1, 0);
-  __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0);
-  __m512i mask_80808080 = _mm512_set1_epi32(0x80808080);
-  __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0);
-  __m512i mask_dfdfdfdf_tail = _mm512_set_epi64(
-      0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf,
-      0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf,
-      0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf);
-  __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2);
-  __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff);
-  __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0);
-  __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00);
-  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
-  // Note that 'tail' is a compile-time constant !
-  __mmask64 b =
-      (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1;
-  __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in)
-                                         : _mm512_maskz_loadu_epi8(b, in);
-  __mmask64 m1 = (tail == SIMDUTF_FULL)
-                     ? _mm512_cmplt_epu8_mask(input, mask_80808080)
-                     : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080);
-  if (_ktestc_mask64_u8(m1,
-                        b)) { // NOT(m1) AND b -- if all zeroes, then all ASCII
-                              // alternatively, we could do 'if (m1 == b) { '
-    if (tail == SIMDUTF_FULL) {
-      in += 64; // consumed 64 bytes
-      // we convert a full 64-byte block, writing 128 bytes.
-      __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
-      if (big_endian) {
-        input1 = _mm512_shuffle_epi8(input1, byteflip);
-      }
-      _mm512_storeu_si512(out, input1);
-      out += 32;
-      __m512i input2 =
-          _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
-      if (big_endian) {
-        input2 = _mm512_shuffle_epi8(input2, byteflip);
-      }
-      _mm512_storeu_si512(out, input2);
-      out += 32;
-      return true; // we are done
-    } else {
-      in += gap;
-      if (gap <= 32) {
-        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
-        if (big_endian) {
-          input1 = _mm512_shuffle_epi8(input1, byteflip);
-        }
-        _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1),
-                                 input1);
-        out += gap;
+              // ____1___ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____1101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high); // bits set in all 3 tables
+}
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) {
+  simd8<uint8_t> prev2 = input.prev<2>(prev_input); // presumably 2 bytes back
+  simd8<uint8_t> prev3 = input.prev<3>(prev_input); // presumably 3 bytes back
+  simd8<uint8_t> must23 =
+      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
+  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80); // keep bit 7 only
+  return must23_80 ^ sc; // XOR with the special-case flags passed in as sc
+}
+
+struct validating_transcoder {
+  // If this is nonzero, there has been a UTF-8 error.
+  simd8<uint8_t> error;
+
+  validating_transcoder() : error(uint8_t(0)) {}
+  //
+  // Check whether the current bytes are valid UTF-8.
+  //
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // prev1 is derived from input and prev_input (presumably input shifted
+    // back by one byte with carry-in from prev_input -- confirm in simd8).
+    // The combined result of both checks is OR-ed into this->error.
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    simd8<uint8_t> sc = check_special_cases(input, prev1);
+    this->error |= check_multibyte_lengths(input, prev_input, sc);
+  }
+
+  simdutf_really_inline size_t convert(const char *in, size_t size,
+                                       char32_t *utf32_output) {
+    size_t pos = 0;
+    char32_t *start{utf32_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, then margin is the index of the eighth
+    // last leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf32(utf32_output);
+        utf32_output += 64;
+        pos += 64;
       } else {
-        __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input));
-        if (big_endian) {
-          input1 = _mm512_shuffle_epi8(input1, byteflip);
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
         }
-        _mm512_storeu_si512(out, input1);
-        out += 32;
-        __m512i input2 =
-            _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1));
-        if (big_endian) {
-          input2 = _mm512_shuffle_epi8(input2, byteflip);
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (utf8_continuation_mask & 1) {
+          return 0; // we have an error
         }
-        _mm512_mask_storeu_epi16(
-            out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2);
-        out += gap - 32;
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 and 12 input bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf32(
+              in + pos, utf8_end_of_code_point_mask, utf32_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
       }
-      return true; // we are done
     }
+    if (errors()) {
+      return 0;
+    }
+    if (pos < size) {
+      size_t howmany =
+          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+      if (howmany == 0) {
+        return 0;
+      }
+      utf32_output += howmany;
+    }
+    return utf32_output - start;
   }
-  // classify characters further
-  __mmask64 m234 = _mm512_cmp_epu8_mask(
-      mask_c0c0c0c0, input,
-      _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte
-  __mmask64 m34 =
-      _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input,
-                           _MM_CMPINT_LT); // 0xdf < input,  3 or 4 leading byte
-
-  __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask(
-      m234, input, mask_c2c2c2c2,
-      _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence)
-                      // Overlong 2-byte sequence
-  if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) {
-    // Overlong 2-byte sequence
-    return false;
-  }
-  if (_ktestz_mask64_u8(m34, m34) == 0) {
-    // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a
-    // 4-byte sequence!
-    __mmask64 m4 = _mm512_cmp_epu8_mask(
-        input, mask_f0f0f0f0,
-        _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes)
-
-    __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL)
-                                   ? _knot_mask64(m1)
-                                   : _kand_mask64(_knot_mask64(m1), b);
 
-    __mmask64 mp1 = _kshiftli_mask64(m234, 1);
-    __mmask64 mp2 = _kshiftli_mask64(m34, 2);
-    // We could do it as follows...
-    // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit
-    // masks a and b and return 1 if all zeroes but GCC generates better code
-    // when we do:
-    if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and
-                   // return 1 if all zeroes
-      // Fast path with 1,2,3 bytes
-      __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes
-      __mmask64 m1234 = _kor_mask64(m1, m234);
-      // mismatched continuation bytes:
-      if (tail == SIMDUTF_FULL) {
-        __mmask64 xnormcm1234 = _kxnor_mask64(
-            mc,
-            m1234); // XNOR of mc and m1234 should be all zero if they differ
-        // the presence of a 1 bit indicates that they overlap.
-        // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return
-        // 1 if all zeroes.
-        if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
-          return false;
-        }
+  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+                                                   char32_t *utf32_output) {
+    size_t pos = 0;
+    char32_t *start{utf32_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, then margin is the index of the eighth
+    // last leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf32(utf32_output);
+        utf32_output += 64;
+        pos += 64;
       } else {
-        __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
-        if (mc != bxorm1234) {
-          return false;
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
         }
-      }
-      // mend: identifying the last bytes of each sequence to be decoded
-      __mmask64 mend = _kshiftri_mask64(m1234, 1);
-      if (tail != SIMDUTF_FULL) {
-        mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1)));
-      }
-
-      __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
-      __m512i last_and_thirdu16 =
-          _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
-
-      __m512i nonasciitags = _mm512_maskz_mov_epi8(
-          mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
-      __m512i clearedbytes = _mm512_andnot_si512(
-          nonasciitags, input); // high two bits cleared where not ASCII
-      __m512i lastbytes = _mm512_maskz_permutexvar_epi8(
-          0x5555555555555555, last_and_thirdu16,
-          clearedbytes); // the last byte of each character
-
-      __mmask64 mask_before_non_ascii = _kshiftri_mask64(
-          mask_not_ascii, 1); // bytes that precede non-ASCII bytes
-      __m512i indexofsecondlastbytes = _mm512_add_epi16(
-          mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
-      __m512i beforeasciibytes =
-          _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
-      __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(
-          0x5555555555555555, indexofsecondlastbytes,
-          beforeasciibytes); // the second last bytes (of two, three byte seq,
-                             // surrogates)
-      secondlastbytes =
-          _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
-
-      __m512i indexofthirdlastbytes = _mm512_add_epi16(
-          mask_ffffffff,
-          indexofsecondlastbytes); // indices of the second last bytes
-      __m512i thirdlastbyte =
-          _mm512_maskz_mov_epi8(m34,
-                                clearedbytes); // only those that are the third
-                                               // last byte of a sequence
-      __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(
-          0x5555555555555555, indexofthirdlastbytes,
-          thirdlastbyte); // the third last bytes (of three byte sequences, hi
-                          // surrogate)
-      thirdlastbytes =
-          _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
-      __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes,
-                                               thirdlastbytes, 254);
-      // the elements of Wout excluding the last element if it happens to be a
-      // high surrogate:
-
-      __mmask64 mprocessed =
-          (tail == SIMDUTF_FULL)
-              ? _pdep_u64(0xFFFFFFFF, mend)
-              : _pdep_u64(
-                    0xFFFFFFFF,
-                    _kand_mask64(
-                        mend, b)); // we adjust mend at the end of the output.
-
-      // Encodings out of range...
-      {
-        // the location of 3-byte sequence start bytes in the input
-        __mmask64 m3 = m34 & (b ^ m4);
-        // code units in Wout corresponding to 3-byte sequences.
-        __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
-        __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
-        __mmask32 Msmall800 =
-            _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
-        __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
-        __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
-        __mmask32 M3s =
-            _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
-        if (_kor_mask32(Msmall800, M3s)) {
-          return false;
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (errors() || (utf8_continuation_mask & 1)) {
+          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+              pos, in + pos, size - pos, utf32_output);
+          res.count += pos;
+          return res;
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 and 12 input bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf32(
+              in + pos, utf8_end_of_code_point_mask, utf32_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
         }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
       }
-      int64_t nout = _mm_popcnt_u64(mprocessed);
-      in += 64 - _lzcnt_u64(mprocessed);
-      if (big_endian) {
-        Wout = _mm512_shuffle_epi8(Wout, byteflip);
+    }
+    if (errors()) {
+      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, utf32_output);
+      res.count += pos;
+      return res;
+    }
+    if (pos < size) {
+      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, utf32_output);
+      if (res.error) { // In case of error, we want the error position
+        res.count += pos;
+        return res;
+      } else { // In case of success, we want the number of words written
+        utf32_output += res.count;
       }
-      _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
-      out += nout;
-      return true; // ok
     }
-    //
-    // We have a 4-byte sequence, this is the general case.
-    // Slow!
-    __mmask64 mp3 = _kshiftli_mask64(m4, 3);
-    __mmask64 mc =
-        _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes
-    __mmask64 m1234 = _kor_mask64(m1, m234);
+    return result(error_code::SUCCESS, utf32_output - start);
+  }
 
-    // mend: identifying the last bytes of each sequence to be decoded
-    __mmask64 mend =
-        _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3);
-    if (tail != SIMDUTF_FULL) {
-      mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1)));
-    }
-    __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity);
-    __m512i last_and_thirdu16 =
-        _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third));
+  simdutf_really_inline bool errors() const { // nonzero error => bad UTF-8
+    return this->error.any_bits_set_anywhere();
+  }
 
-    __m512i nonasciitags = _mm512_maskz_mov_epi8(
-        mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000  other: 11000000
-    __m512i clearedbytes = _mm512_andnot_si512(
-        nonasciitags, input); // high two bits cleared where not ASCII
-    __m512i lastbytes = _mm512_maskz_permutexvar_epi8(
-        0x5555555555555555, last_and_thirdu16,
-        clearedbytes); // the last byte of each character
+}; // struct validating_transcoder
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
+// other functions
+/* begin file src/generic/utf8.h */
 
-    __mmask64 mask_before_non_ascii = _kshiftri_mask64(
-        mask_not_ascii, 1); // bytes that precede non-ASCII bytes
-    __m512i indexofsecondlastbytes = _mm512_add_epi16(
-        mask_ffffffff, last_and_thirdu16); // indices of the second last bytes
-    __m512i beforeasciibytes =
-        _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes);
-    __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8(
-        0x5555555555555555, indexofsecondlastbytes,
-        beforeasciibytes); // the second last bytes (of two, three byte seq,
-                           // surrogates)
-    secondlastbytes =
-        _mm512_slli_epi16(secondlastbytes, 6); // shifted into position
+namespace simdutf {
+namespace haswell {
+namespace {
+namespace utf8 {
 
-    __m512i indexofthirdlastbytes = _mm512_add_epi16(
-        mask_ffffffff,
-        indexofsecondlastbytes); // indices of the second last bytes
-    __m512i thirdlastbyte = _mm512_maskz_mov_epi8(
-        m34,
-        clearedbytes); // only those that are the third last byte of a sequence
-    __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(
-        0x5555555555555555, indexofthirdlastbytes,
-        thirdlastbyte); // the third last bytes (of three byte sequences, hi
-                        // surrogate)
-    thirdlastbytes =
-        _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position
-    __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32(
-        lastbytes, secondlastbytes, thirdlastbytes, 254);
-    uint64_t Mlo_uint64 = _pext_u64(mp3, mend);
-    __mmask32 Mlo = __mmask32(Mlo_uint64);
-    __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1);
-    __m512i lo_surr_mask = _mm512_maskz_mov_epi16(
-        Mlo,
-        mask_dc00dc00); // lo surr: 1101110000000000, other:  0000000000000000
-    __m512i shifted4_thirdsecondandlastbytes =
-        _mm512_srli_epi16(thirdsecondandlastbytes,
-                          4); // hi surr: 00000WVUTSRQPNML  vuts = WVUTS - 1
-    __m512i tagged_lo_surrogates = _mm512_or_si512(
-        thirdsecondandlastbytes,
-        lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other:  unchanged
-    __m512i Wout = _mm512_mask_add_epi16(
-        tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes,
-        mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other:  unchanged
-    // the elements of Wout excluding the last element if it happens to be a
-    // high surrogate:
-    __mmask32 Mout = ~(Mhi & 0x80000000);
-    __mmask64 mprocessed =
-        (tail == SIMDUTF_FULL)
-            ? _pdep_u64(Mout, mend)
-            : _pdep_u64(
-                  Mout,
-                  _kand_mask64(mend,
-                               b)); // we adjust mend at the end of the output.
+using namespace simd;
 
-    // mismatched continuation bytes:
-    if (tail == SIMDUTF_FULL) {
-      __mmask64 xnormcm1234 = _kxnor_mask64(
-          mc, m1234); // XNOR of mc and m1234 should be all zero if they differ
-      // the presence of a 1 bit indicates that they overlap.
-      // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1
-      // if all zeroes.
-      if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) {
-        return false;
-      }
-    } else {
-      __mmask64 bxorm1234 = _kxor_mask64(b, m1234);
-      if (mc != bxorm1234) {
-        return false;
-      }
-    }
-    // Encodings out of range...
-    {
-      // the location of 3-byte sequence start bytes in the input
-      __mmask64 m3 = m34 & (b ^ m4);
-      // code units in Wout corresponding to 3-byte sequences.
-      __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend));
-      __m512i mask_08000800 = _mm512_set1_epi32(0x08000800);
-      __mmask32 Msmall800 =
-          _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800);
-      __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800);
-      __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800);
-      __mmask32 M3s =
-          _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800);
-      __m512i mask_04000400 = _mm512_set1_epi32(0x04000400);
-      __mmask32 M4s =
-          _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400);
-      if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) {
-        return false;
-      }
-    }
-    in += 64 - _lzcnt_u64(mprocessed);
-    int64_t nout = _mm_popcnt_u64(mprocessed);
-    if (big_endian) {
-      Wout = _mm512_shuffle_epi8(Wout, byteflip);
+simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
+  size_t offset = 0, total = 0;
+  for (; offset + 64 <= size; offset += 64) { // full 64-byte blocks only
+    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(in + offset));
+    // Signed bytes > -65 are not continuations (10xxxxxx): ASCII or leads.
+    const uint64_t starts = block.gt(-65);
+    total += count_ones(starts);
+  }
+  return total + scalar::utf8::count_code_points(in + offset, size - offset);
+}
+
+simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
+                                                    size_t size) {
+  size_t pos = 0;
+  size_t count = 0;
+  // 64 bytes per iteration; the scalar routine below finishes the tail.
+  for (; pos + 64 <= size; pos += 64) {
+    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+    // One UTF-16 unit for every byte that is not a continuation byte
+    // (ASCII and lead bytes).
+    count += 64 - count_ones(utf8_continuation_mask);
+    uint64_t utf8_4byte = input.gteq_unsigned(240); // bytes >= 0xF0
+    count += count_ones(utf8_4byte); // 4-byte sequences need a second unit
+  }
+  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
+}
+} // namespace utf8
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
+/* end file src/generic/utf8.h */
+/* begin file src/generic/utf16.h */
+namespace simdutf {
+namespace haswell {
+namespace {
+namespace utf16 {
+
+template <endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t *in,
+                                               size_t size) {
+  size_t pos = 0;
+  size_t count = 0; // code units outside 0xDC00..0xDFFF (low surrogates)
+  for (; pos < size / 32 * 32; pos += 32) { // whole 32-unit blocks only
+    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+    if (!match_system(big_endian)) {
+      input.swap_bytes(); // input is not in host byte order
     }
-      _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout);
-      out += nout;
-      return true; // ok
+    uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
+    count += count_ones(not_pair) / 2; // presumably 2 mask bits per unit
   }
-  // Fast path 2: all ASCII or 2 byte
-  __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL)
-                                        ? _knot_mask64(m234)
-                                        : _kand_mask64(_knot_mask64(m234), b);
-  // on top of -0xc0 we subtract -2 which we get back later of the
-  // continuation byte tags
-  __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
-  __mmask64 leading = tail == (tail == SIMDUTF_FULL)
-                          ? _kor_mask64(m1, m234)
-                          : _kand_mask64(_kor_mask64(m1, m234),
-                                         b); // first bytes of each sequence
-  if (tail == SIMDUTF_FULL) {
-    __mmask64 xnor234leading =
-        _kxnor_mask64(_kshiftli_mask64(m234, 1), leading);
-    if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) {
-      return false;
-    }
-  } else {
-    __mmask64 bxorleading = _kxor_mask64(b, leading);
-    if (_kshiftli_mask64(m234, 1) != bxorleading) {
-      return false;
+  return count +
+         scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
+}
+
+template <endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
+                                                    size_t size) {
+  size_t pos = 0;
+  size_t count = 0;
+  // This algorithm could no doubt be improved!
+  for (; pos < size / 32 * 32; pos += 32) {
+    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+    if (!match_system(big_endian)) {
+      input.swap_bytes(); // input is not in host byte order
     }
-  }
-  //
-  if (tail == SIMDUTF_FULL) {
-    // In the two-byte/ASCII scenario, we are easily latency bound, so we want
-    // to increment the input buffer as quickly as possible.
-    // We process 32 bytes unless the byte at index 32 is a continuation byte,
-    // in which case we include it as well for a total of 33 bytes.
-    // Note that if x is an ASCII byte, then the following is false:
-    // int8_t(x) <= int8_t(0xc0) under two's complement.
-    in += 32;
-    if (int8_t(*in) <= int8_t(0xc0))
-      in++;
-    // The alternative is to do
-    // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
-    // but it requires loading the input, doing the mask computation, and
-    // converting back the mask to a general register. It just takes too long,
-    // leaving the processor likely to be idle.
-  } else {
-    in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii));
-  }
-  __m512i lead = _mm512_maskz_compress_epi8(
-      leading, leading2byte); // will contain zero for ascii, and the data
-  lead = _mm512_cvtepu8_epi16(
-      _mm512_castsi512_si256(lead)); // ... zero extended into code units
-  __m512i follow = _mm512_maskz_compress_epi8(
-      continuation_or_ascii, input); // the last bytes of each sequence
-  follow = _mm512_cvtepu8_epi16(
-      _mm512_castsi512_si256(follow)); // ... zero extended into code units
-  lead = _mm512_slli_epi16(lead, 6);   // shifted into position
-  __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow
+    uint64_t ascii_mask = input.lteq(0x7F);
+    uint64_t twobyte_mask = input.lteq(0x7FF);
+    uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
 
-  if (big_endian) {
-    final = _mm512_shuffle_epi8(final, byteflip);
+    size_t ascii_count = count_ones(ascii_mask) / 2; // 1 byte each
+    size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
+    size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
+    size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; // surrogates
+    count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count +
+             ascii_count; // 2 per surrogate unit: 4 bytes per pair
   }
-  if (tail == SIMDUTF_FULL) {
-    // Next part is UTF-16 specific and can be generalized to UTF-32.
-    int nout = _mm_popcnt_u32(uint32_t(leading));
-    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
-    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
-  } else {
-    int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading)));
-    _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final);
-    out += nout; // UTF-8 to UTF-16 is only expansionary in this case.
+  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
+                                                                   size - pos);
+}
+
+template <endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
+                                                     size_t size) {
+  return count_code_points<big_endian>(in, size); // one unit per code point
+}
+
+simdutf_really_inline void
+change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
+  size_t pos = 0; // code units already byte-swapped by the SIMD loop
+
+  while (pos < size / 32 * 32) { // 32 code units per iteration
+    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+    input.swap_bytes();
+    input.store(reinterpret_cast<uint16_t *>(output));
+    pos += 32;
+    output += 32;
   }
 
-  return true; // we are fine.
+  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
 }
 
-/*
-    utf32_to_utf16_masked converts `count` lower UTF-32 code units
-    from input `utf32` into UTF-16. It differs from utf32_to_utf16
-    in that it 'masks' the writes.
+} // namespace utf16
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
+/* end file src/generic/utf16.h */
 
-    Returns how many 16-bit code units were stored.
+// transcoding from UTF-8 to Latin 1
+/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
 
-    byteflip is used for flipping 16-bit code units, and it should be
-        __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-    We pass it to the (always inlined) function to encourage the compiler to
-    keep the value in a (constant) register.
-*/
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip,
-                                                   __m512i utf32,
-                                                   unsigned int count,
-                                                   char16_t *output) {
+namespace simdutf {
+namespace haswell {
+namespace {
+namespace utf8_to_latin1 {
+using namespace simd;
 
-  const __mmask16 valid = uint16_t((1 << count) - 1);
-  // 1. check if we have any surrogate pairs
-  const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
-  const __mmask16 sp_mask =
-      _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff);
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // For UTF-8 to Latin 1 we allow any ASCII byte and any continuation byte,
+  // but a non-ASCII leading byte must be 0b11000010 or 0b11000011.
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation)
+  // Bit 2 = Overlong 3-byte
+  // Bit 3 = Too Large
+  // Bit 4 = Surrogate
+  // Bit 5 = Overlong 2-byte
+  // Bit 6 = Too Large (second byte 1000____) / Overlong 4-byte
+  // Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+  constexpr const uint8_t FORBIDDEN = 0xff; // all error bits at once
 
-  if (sp_mask == 0) {
-    if (big_endian) {
-      _mm256_mask_storeu_epi16(
-          (__m256i *)output, valid,
-          _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),
-                              _mm512_castsi512_si256(byteflip)));
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      FORBIDDEN,
+      // 1110____ ________ <three byte lead in byte 1>
+      FORBIDDEN,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      FORBIDDEN);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
 
-    } else {
-      _mm256_mask_storeu_epi16((__m256i *)output, valid,
-                               _mm512_cvtepi32_epi16(utf32));
-    }
-    return count;
-  }
+              // ____0100 ________
+              FORBIDDEN,
+              // ____0101 ________
+              FORBIDDEN,
+              // ____011_ ________
+              FORBIDDEN, FORBIDDEN,
 
-  {
-    // build surrogate pair code units in 32-bit lanes
+              // ____1___ ________
+              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
+              // ____1101 ________ and ____111_ ________
+              FORBIDDEN, FORBIDDEN, FORBIDDEN);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
 
-    //    t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb]
-    const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
-    const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
 
-    //    t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
-    const __m512i t1 = _mm512_slli_epi32(t0, 6);
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high);
+}
 
-    //    t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1
-    //    to t0
-    //         0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
-    const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
-    const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);
+struct validating_transcoder {
+  // If this is nonzero, there has been a UTF-8 error.
+  simd8<uint8_t> error;
 
-    //    t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1
-    //    to t0
-    //         0xba = (t2 and not v_fc00_fc000) or v_d800_dc00
-    const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
-    const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
-    const __m512i t3 =
-        _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
-    const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
-    __m512i t5 = _mm512_ror_epi32(t4, 16);
-    // Here we want to trim all of the upper 16-bit code units from the 2-byte
-    // characters represented as 4-byte values. We can compute it from
-    // sp_mask or the following... It can be more optimized!
-    const __mmask32 nonzero = _kor_mask32(
-        0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
-    const __mmask32 nonzero_masked =
-        _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2 * count)) - 1));
-    if (big_endian) {
-      t5 = _mm512_shuffle_epi8(t5, byteflip);
+  validating_transcoder() : error(uint8_t(0)) {}
+  // Check `input` against the byte before each lane and OR any violation
+  // into `error` (only the checks UTF-8 to Latin 1 needs, see
+  // check_special_cases).
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // prev1 is `input` shifted by one byte, with the gap filled from
+    // prev_input, so each lane can be checked against the byte before it
+    // (presumably what simd8::prev<1> does -- TODO confirm).
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    this->error |= check_special_cases(input, prev1);
+  }
+
+  simdutf_really_inline size_t convert(const char *in, size_t size,
+                                       char *latin1_output) {
+    size_t pos = 0;
+    char *start{latin1_output}; // result is latin1_output - start, 0 on error
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 16 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 16; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) >
+                       -65); // twos complement of -65 is 1011 1111 ...
     }
-    // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
-    // (zen4)
-    __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5);
-    _mm512_mask_storeu_epi16(
-        output,
-        (1 << (count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1,
-        compressed);
-    //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5);
+    // If the input is long enough, in[margin] is now the sixteenth-from-last
+    // leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store((int8_t *)latin1_output);
+        latin1_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask =
+            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
+                               // this case, we also have ASCII to account for.
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 input bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_latin1(
+              in + pos, utf8_end_of_code_point_mask, latin1_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      return 0;
+    }
+    if (pos < size) {
+      size_t howmany =
+          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
+      if (howmany == 0) {
+        return 0;
+      }
+      latin1_output += howmany;
+    }
+    return latin1_output - start;
   }
 
-  return count + static_cast<unsigned int>(count_ones(sp_mask));
-}
+  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+                                                   char *latin1_output) {
+    size_t pos = 0; // input position; errors are reported relative to `in`
+    char *start{latin1_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, in[margin] is now the eighth-from-last
+    // leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store((int8_t *)latin1_output);
+        latin1_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        if (errors()) {
+          // rewind_and_convert_with_errors will seek a potential error from
+          // in+pos onward, with the ability to go back up to pos bytes, and
+          // read size-pos bytes forward.
+          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+              pos, in + pos, size - pos, latin1_output);
+          res.count += pos;
+          return res;
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 input bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_latin1(
+              in + pos, utf8_end_of_code_point_mask, latin1_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, latin1_output);
+      res.count += pos;
+      return res;
+    }
+    if (pos < size) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, latin1_output);
+      if (res.error) { // In case of error, we want the error position
+        res.count += pos;
+        return res;
+      } else { // In case of success, we want the number of bytes written
+        latin1_output += res.count;
+      }
+    }
+    return result(error_code::SUCCESS, latin1_output - start);
+  }
 
-/*
-    utf32_to_utf16 converts `count` lower UTF-32 code units
-    from input `utf32` into UTF-16. It may overflow.
+  simdutf_really_inline bool errors() const {
+    return this->error.any_bits_set_anywhere();
+  }
 
-    Returns how many 16-bit code units were stored.
+}; // struct validating_transcoder
+} // namespace utf8_to_latin1
+} // unnamed namespace
+} // namespace haswell
+} // namespace simdutf
+/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
+/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
 
-    byteflip is used for flipping 16-bit code units, and it should be
-        __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-    We pass it to the (always inlined) function to encourage the compiler to
-    keep the value in a (constant) register.
-*/
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip,
-                                            __m512i utf32, unsigned int count,
-                                            char16_t *output) {
-  // check if we have any surrogate pairs
-  const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff);
-  const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
+namespace simdutf {
+namespace haswell {
+namespace {
+namespace utf8_to_latin1 {
+using namespace simd;
 
-  if (sp_mask == 0) {
-    // technically, it should be _mm256_storeu_epi16
-    if (big_endian) {
-      _mm256_storeu_si256(
-          (__m256i *)output,
-          _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32),
-                              _mm512_castsi512_si256(byteflip)));
+simdutf_really_inline size_t convert_valid(const char *in, size_t size,
+                                           char *latin1_output) {
+  size_t pos = 0;
+  char *start{latin1_output};
+  // In the worst case, we have the haswell kernel which can cause an overflow
+  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
+  // 16 bytes, and if the data is valid, then it is entirely safe because 16
+  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
+  // assume that you have valid UTF-8 input, so we are going to go back from the
+  // end counting 8 leading bytes, to give us a good margin.
+  size_t leading_byte = 0;
+  size_t margin = size;
+  for (; margin > 0 && leading_byte < 8; margin--) {
+    leading_byte += (int8_t(in[margin - 1]) >
+                     -65); // twos complement of -65 is 1011 1111 ...
+  }
+  // If the input is long enough, then in[margin] is the eighth leading byte
+  // counted from the end.
+  const size_t safety_margin = size - margin + 1; // to avoid overruns!
+  while (pos + 64 + safety_margin <= size) {
+    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+    if (input.is_ascii()) {
+      input.store((int8_t *)latin1_output);
+      latin1_output += 64;
+      pos += 64;
     } else {
-      _mm256_storeu_si256((__m256i *)output, _mm512_cvtepi32_epi16(utf32));
+      // you might think that a for-loop would work, but under Visual Studio, it
+      // is not good enough.
+      uint64_t utf8_continuation_mask =
+          input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
+                             // this case, we also have ASCII to account for.
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+      // We process in blocks of up to 12 bytes except possibly
+      // for fast paths which may process up to 16 bytes. For the
+      // slow path to work, we should have at least 12 input bytes left.
+      size_t max_starting_point = (pos + 64) - 12;
+      // Next loop is going to run at least five times.
+      while (pos < max_starting_point) {
+        // Performance note: our ability to compute 'consumed' and
+        // then shift and recompute is critical. If there is a
+        // latency of, say, 4 cycles on getting 'consumed', then
+        // the inner loop might have a total latency of about 6 cycles.
+        // Yet we process between 6 and 12 input bytes, thus we get
+        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+        // for this section of the code. Hence, there is a limit
+        // to how much we can further increase this latency before
+        // it seriously harms performance.
+        size_t consumed = convert_masked_utf8_to_latin1(
+            in + pos, utf8_end_of_code_point_mask, latin1_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
+      }
+      // At this point there may remain between 0 and 12 bytes in the
+      // 64-byte block. These bytes will be processed again. So we have an
+      // 80% efficiency (in the worst case). In practice we expect an
+      // 85% to 90% efficiency.
     }
-    return count;
   }
+  if (pos < size) {
+    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
+                                                           latin1_output);
+    latin1_output += howmany;
+  }
+  return latin1_output - start;
+}
 
-  {
-    // build surrogate pair code units in 32-bit lanes
-
-    //    t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb]
-    const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000);
-    const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000);
-
-    //    t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000]
-    const __m512i t1 = _mm512_slli_epi32(t0, 6);
+} // namespace utf8_to_latin1
+} // namespace
+} // namespace haswell
+} // namespace simdutf
+  // namespace simdutf
+/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
 
-    //    t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1
-    //    to t0
-    //         0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000)
-    const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000);
-    const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4);
+namespace simdutf {
+namespace haswell {
 
-    //    t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1
-    //    to t0
-    //         0xba = (t2 and not v_fc00_fc000) or v_d800_dc00
-    const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00);
-    const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00);
-    const __m512i t3 =
-        _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba);
-    const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3);
-    __m512i t5 = _mm512_ror_epi32(t4, 16);
-    const __mmask32 nonzero = _kor_mask32(
-        0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512()));
-    if (big_endian) {
-      t5 = _mm512_shuffle_epi8(t5, byteflip);
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+                                 size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  if (bom_encoding != encoding_type::unspecified) {
+    return bom_encoding;
+  }
+  int out = 0;
+  if (validate_utf8(input, length)) {
+    out |= encoding_type::UTF8;
+  }
+  if ((length % 2) == 0) {
+    if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
+                         length / 2)) {
+      out |= encoding_type::UTF16_LE;
     }
-    // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability
-    // (zen4)
-    __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5);
-    _mm512_mask_storeu_epi16(
-        output,
-        (1 << (count + static_cast<unsigned int>(count_ones(sp_mask)))) - 1,
-        compressed);
-    //_mm512_mask_compressstoreu_epi16(output, nonzero, t5);
   }
-
-  return count + static_cast<unsigned int>(count_ones(sp_mask));
+  if ((length % 4) == 0) {
+    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+      out |= encoding_type::UTF32_LE;
+    }
+  }
+  return out;
 }
 
-/**
- * Store the last N bytes of previous followed by 512-N bytes from input.
- */
-template <int N> __m512i prev(__m512i input, __m512i previous) {
-  static_assert(N <= 32, "N must be no larger than 32");
-  const __m512i movemask =
-      _mm512_setr_epi32(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
-  const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous);
-#if SIMDUTF_GCC8 || SIMDUTF_GCC9
-  constexpr int shift = 16 - N; // workaround for GCC8,9
-  return _mm512_alignr_epi8(input, rotated, shift);
-#else
-  return _mm512_alignr_epi8(input, rotated, 16 - N);
-#endif // SIMDUTF_GCC8 || SIMDUTF_GCC9
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  return haswell::utf8_validation::generic_validate_utf8(buf, len);
 }
 
-template <unsigned idx0, unsigned idx1, unsigned idx2, unsigned idx3>
-__m512i shuffle_epi128(__m512i v) {
-  static_assert((idx0 >= 0 && idx0 <= 3), "idx0 must be in range 0..3");
-  static_assert((idx1 >= 0 && idx1 <= 3), "idx1 must be in range 0..3");
-  static_assert((idx2 >= 0 && idx2 <= 3), "idx2 must be in range 0..3");
-  static_assert((idx3 >= 0 && idx3 <= 3), "idx3 must be in range 0..3");
-
-  constexpr unsigned shuffle = idx0 | (idx1 << 2) | (idx2 << 4) | (idx3 << 6);
-  return _mm512_shuffle_i32x4(v, v, shuffle);
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+    const char *buf, size_t len) const noexcept {
+  return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len);
 }
 
-template <unsigned idx> constexpr __m512i broadcast_epi128(__m512i v) {
-  return shuffle_epi128<idx, idx, idx, idx>(v);
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  return haswell::utf8_validation::generic_validate_ascii(buf, len);
 }
 
-/**
- * Current unused.
- */
-template <int N> __m512i rotate_by_N_epi8(const __m512i input) {
-
-  // lanes order: 1, 2, 3, 0 => 0b00_11_10_01
-  const __m512i permuted = _mm512_shuffle_i32x4(input, input, 0x39);
-
-  return _mm512_alignr_epi8(permuted, input, N);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+    const char *buf, size_t len) const noexcept {
+  return haswell::utf8_validation::generic_validate_ascii_with_errors(buf, len);
 }
 
-/*
-    expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`)
-    stored at separate 32-bit lanes.
-
-    For each lane we have also a character class (`char_class), given in form
-    0x8080800N, where N is 4 highest bits from the leading byte; 0x80 resets
-    corresponding bytes during pshufb.
-*/
-simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class,
-                                                     __m512i utf8) {
-  /*
-      Input:
-      - utf8: bytes stored at separate 32-bit code units
-      - valid: which code units have valid UTF-8 characters
-
-      Bit layout of single word. We show 4 cases for each possible
-      UTF-8 character encoding. The `?` denotes bits we must not
-      assume their value.
-
-      |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char
-      |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char
-      |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char
-      |????.????|????.????|????.????|0aaa.aaaa| ASCII char
-        byte 3    byte 2    byte 1     byte 0
-  */
-
-  /* 1. Reset control bits of continuation bytes and the MSB
-        of the leading byte; this makes all bytes unsigned (and
-        does not alter ASCII char).
-
-      |00dd.dddd|00cc.cccc|00bb.bbbb|0111.0aaa| 4-byte char
-      |00??.????|00cc.cccc|00bb.bbbb|0110.aaaa| 3-byte char
-      |00??.????|00??.????|00bb.bbbb|010a.aaaa| 2-byte char
-      |00??.????|00??.????|00??.????|0aaa.aaaa| ASCII char
-       ^^        ^^        ^^        ^
-  */
-  __m512i values;
-  const __m512i v_3f3f_3f7f = _mm512_set1_epi32(0x3f3f3f7f);
-  values = _mm512_and_si512(utf8, v_3f3f_3f7f);
-
-  /* 2. Swap and join fields A-B and C-D
-
-      |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char
-      |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char
-      |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char
-      |0000.????|????.????|000a.aaaa|aa??.????| ASCII char */
-  const __m512i v_0140_0140 = _mm512_set1_epi32(0x01400140);
-  values = _mm512_maddubs_epi16(values, v_0140_0140);
-
-  /* 3. Swap and join fields AB & CD
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *buf,
+                                 size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid UTF-16. protect the implementation from
+    // handling nullptr
+    return true;
+  }
+  const char16_t *tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
+  if (tail) {
+    return scalar::utf16::validate<endianness::LITTLE>(tail,
+                                                       len - (tail - buf));
+  } else {
+    return false;
+  }
+}
 
-      |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char
-      |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char
-      |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char
-      |0000.000a|aaaa.aa??|????.????|????.????| ASCII char */
-  const __m512i v_0001_1000 = _mm512_set1_epi32(0x00011000);
-  values = _mm512_madd_epi16(values, v_0001_1000);
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *buf,
+                                 size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid UTF-16. protect the implementation from
+    // handling nullptr
+    return true;
+  }
+  const char16_t *tail = avx2_validate_utf16<endianness::BIG>(buf, len);
+  if (tail) {
+    return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
+}
 
-  /* 4. Shift left the values by variable amounts to reset highest UTF-8 bits
-      |aaab.bbbb|bccc.cccd|dddd.d000|0000.0000| 4-byte char -- by 11
-      |aaaa.bbbb|bbcc.cccc|????.??00|0000.0000| 3-byte char -- by 10
-      |aaaa.abbb|bbb?.????|????.???0|0000.0000| 2-byte char -- by 9
-      |aaaa.aaa?|????.????|????.????|?000.0000| ASCII char -- by 7 */
-  {
-    /** pshufb
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+    const char16_t *buf, size_t len) const noexcept {
+  result res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(
+        buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
+}
 
-    continuation = 0
-    ascii    = 7
-    _2_bytes = 9
-    _3_bytes = 10
-    _4_bytes = 11
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+    const char16_t *buf, size_t len) const noexcept {
+  result res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(
+        buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
+}
 
-    shift_left_v3 = 4 * [
-        ascii, # 0000
-        ascii, # 0001
-        ascii, # 0010
-        ascii, # 0011
-        ascii, # 0100
-        ascii, # 0101
-        ascii, # 0110
-        ascii, # 0111
-        continuation, # 1000
-        continuation, # 1001
-        continuation, # 1010
-        continuation, # 1011
-        _2_bytes, # 1100
-        _2_bytes, # 1101
-        _3_bytes, # 1110
-        _4_bytes, # 1111
-    ] */
-    const __m512i shift_left_v3 = _mm512_setr_epi64(
-        0x0707070707070707, 0x0b0a090900000000, 0x0707070707070707,
-        0x0b0a090900000000, 0x0707070707070707, 0x0b0a090900000000,
-        0x0707070707070707, 0x0b0a090900000000);
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid UTF-32. protect the implementation from
+    // handling nullptr
+    return true;
+  }
+  const char32_t *tail = avx2_validate_utf32le(buf, len);
+  if (tail) {
+    return scalar::utf32::validate(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
+}
 
-    const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class);
-    values = _mm512_sllv_epi32(values, shift);
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+    const char32_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    // empty input is valid UTF-32. protect the implementation from
+    // handling nullptr
+    return result(error_code::SUCCESS, 0);
+  }
+  result res = avx2_validate_utf32le_with_errors(buf, len);
+  if (res.count != len) {
+    result scalar_res =
+        scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
   }
+}
 
-  /* 5. Shift right the values by variable amounts to reset lowest bits
-      |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char -- by 11
-      |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char -- by 16
-      |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char -- by 21
-      |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char -- by 25 */
-  {
-    // 4 * [25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 21, 21, 16, 11]
-    const __m512i shift_right = _mm512_setr_epi64(
-        0x1919191919191919, 0x0b10151500000000, 0x1919191919191919,
-        0x0b10151500000000, 0x1919191919191919, 0x0b10151500000000,
-        0x1919191919191919, 0x0b10151500000000);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
+    const char *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char *, char *> ret =
+      avx2_convert_latin1_to_utf8(buf, len, utf8_output);
+  size_t converted_chars = ret.second - utf8_output;
 
-    const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class);
-    values = _mm512_srlv_epi32(values, shift);
+  if (ret.first != buf + len) {
+    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
+        ret.first, len - (ret.first - buf), ret.second);
+    converted_chars += scalar_converted_chars;
   }
 
-  return values;
+  return converted_chars;
 }
 
-simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1,
-                                                  int &count) {
-  const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1);
-  const __m512i expand_ver2 = _mm512_setr_epi64(
-      0x0403020103020100, 0x0605040305040302, 0x0807060507060504,
-      0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a,
-      0x000f0e0d0f0e0d0c, 0x0201000f01000f0e);
-  const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);
-  const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);
-  const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);
-  const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);
-  const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);
-  count = static_cast<int>(count_ones(leading_bytes));
-  return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes,
-                                    input);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char *, char16_t *> ret =
+      avx2_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t converted_chars = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_converted_chars =
+        scalar::latin1_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_converted_chars == 0) {
+      return 0;
+    }
+    converted_chars += scalar_converted_chars;
+  }
+  return converted_chars;
 }
 
-simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) {
-  __m512i char_class = _mm512_srli_epi32(input, 4);
-  /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */
-  const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);
-  const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);
-  char_class =
-      _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea);
-  return expanded_utf8_to_utf32(char_class, input);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char *, char16_t *> ret =
+      avx2_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t converted_chars = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_converted_chars =
+        scalar::latin1_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_converted_chars == 0) {
+      return 0;
+    }
+    converted_chars += scalar_converted_chars;
+  }
+  return converted_chars;
 }
-/* end file src/icelake/icelake_utf8_common.inl.cpp */
-/* begin file src/icelake/icelake_macros.inl.cpp */
-
-/*
-    This upcoming macro (SIMDUTF_ICELAKE_TRANSCODE16) takes 16 + 4 bytes (of a
-   UTF-8 string) and loads all possible 4-byte substring into an AVX512
-   register.
-
-    For example if we have bytes abcdefgh... we create following 32-bit lanes
-
-    [abcd|bcde|cdef|defg|efgh|...]
-     ^                          ^
-     byte 0 of reg              byte 63 of reg
-*/
-/** pshufb
-        # lane{0,1,2} have got bytes: [  0,  1,  2,  3,  4,  5,  6,  8,  9, 10,
-   11, 12, 13, 14, 15] # lane3 has got bytes:        [ 16, 17, 18, 19,  4,  5,
-   6,  8,  9, 10, 11, 12, 13, 14, 15]
 
-        expand_ver2 = [
-            # lane 0:
-            0, 1, 2, 3,
-            1, 2, 3, 4,
-            2, 3, 4, 5,
-            3, 4, 5, 6,
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char *, char32_t *> ret =
+      avx2_convert_latin1_to_utf32(buf, len, utf32_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t converted_chars = ret.second - utf32_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
+        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_converted_chars == 0) {
+      return 0;
+    }
+    converted_chars += scalar_converted_chars;
+  }
+  return converted_chars;
+}
 
-            # lane 1:
-            4, 5, 6, 7,
-            5, 6, 7, 8,
-            6, 7, 8, 9,
-            7, 8, 9, 10,
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  utf8_to_latin1::validating_transcoder converter;
+  return converter.convert(buf, len, latin1_output);
+}
 
-            # lane 2:
-             8,  9, 10, 11,
-             9, 10, 11, 12,
-            10, 11, 12, 13,
-            11, 12, 13, 14,
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  utf8_to_latin1::validating_transcoder converter;
+  return converter.convert_with_errors(buf, len, latin1_output);
+}
 
-            # lane 3 order: 13, 14, 15, 16 14, 15, 16, 17, 15, 16, 17, 18, 16,
-   17, 18, 19 12, 13, 14, 15, 13, 14, 15,  0, 14, 15,  0,  1, 15,  0,  1,  2,
-        ]
-*/
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+    const char *input, size_t size, char *latin1_output) const noexcept {
+  return utf8_to_latin1::convert_valid(input, size, latin1_output);
+}
 
-#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED)                      \
-  {                                                                            \
-    const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1);        \
-    const __m512i expand_ver2 = _mm512_setr_epi64(                             \
-        0x0403020103020100, 0x0605040305040302, 0x0807060507060504,            \
-        0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a,            \
-        0x000f0e0d0f0e0d0c, 0x0201000f01000f0e);                               \
-    const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2);            \
-                                                                               \
-    __mmask16 leading_bytes;                                                   \
-    const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0);                       \
-    const __m512i t0 = _mm512_and_si512(input, v_0000_00c0);                   \
-    const __m512i v_0000_0080 = _mm512_set1_epi32(0x80);                       \
-    leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080);                 \
-                                                                               \
-    __m512i char_class;                                                        \
-    char_class = _mm512_srli_epi32(input, 4);                                  \
-    /*  char_class = ((input >> 4) & 0x0f) | 0x80808000 */                     \
-    const __m512i v_0000_000f = _mm512_set1_epi32(0x0f);                       \
-    const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000);                 \
-    char_class =                                                               \
-        _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); \
-                                                                               \
-    const int valid_count = static_cast<int>(count_ones(leading_bytes));       \
-    const __m512i utf32 = expanded_utf8_to_utf32(char_class, input);           \
-                                                                               \
-    const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(),     \
-                                                   leading_bytes, utf32);      \
-                                                                               \
-    if (UTF32) {                                                               \
-      if (MASKED) {                                                            \
-        const __mmask16 valid = uint16_t((1 << valid_count) - 1);              \
-        _mm512_mask_storeu_epi32((__m512i *)output, valid, out);               \
-      } else {                                                                 \
-        _mm512_storeu_si512((__m512i *)output, out);                           \
-      }                                                                        \
-      output += valid_count;                                                   \
-    } else {                                                                   \
-      if (MASKED) {                                                            \
-        output += utf32_to_utf16_masked<big_endian>(                           \
-            byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
-      } else {                                                                 \
-        output += utf32_to_utf16<big_endian>(                                  \
-            byteflip, out, valid_count, reinterpret_cast<char16_t *>(output)); \
-      }                                                                        \
-    }                                                                          \
-  }
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
+}
 
-#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED)       \
-  {                                                                            \
-    if (UTF32) {                                                               \
-      if (MASKED) {                                                            \
-        const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1);         \
-        _mm512_mask_storeu_epi32((__m512i *)output, valid_mask, INPUT);        \
-      } else {                                                                 \
-        _mm512_storeu_si512((__m512i *)output, INPUT);                         \
-      }                                                                        \
-      output += VALID_COUNT;                                                   \
-    } else {                                                                   \
-      if (MASKED) {                                                            \
-        output += utf32_to_utf16_masked<big_endian>(                           \
-            byteflip, INPUT, VALID_COUNT,                                      \
-            reinterpret_cast<char16_t *>(output));                             \
-      } else {                                                                 \
-        output +=                                                              \
-            utf32_to_utf16<big_endian>(byteflip, INPUT, VALID_COUNT,           \
-                                       reinterpret_cast<char16_t *>(output));  \
-      }                                                                        \
-    }                                                                          \
-  }
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert<endianness::BIG>(buf, len, utf16_output);
+}
 
-#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)                       \
-  if (UTF32) {                                                                 \
-    const __m128i t0 = _mm512_castsi512_si128(utf8);                           \
-    const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1);                     \
-    const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2);                     \
-    const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3);                     \
-    _mm512_storeu_si512((__m512i *)(output + 0 * 16),                          \
-                        _mm512_cvtepu8_epi32(t0));                             \
-    _mm512_storeu_si512((__m512i *)(output + 1 * 16),                          \
-                        _mm512_cvtepu8_epi32(t1));                             \
-    _mm512_storeu_si512((__m512i *)(output + 2 * 16),                          \
-                        _mm512_cvtepu8_epi32(t2));                             \
-    _mm512_storeu_si512((__m512i *)(output + 3 * 16),                          \
-                        _mm512_cvtepu8_epi32(t3));                             \
-  } else {                                                                     \
-    const __m256i h0 = _mm512_castsi512_si256(utf8);                           \
-    const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1);                     \
-    if (big_endian) {                                                          \
-      _mm512_storeu_si512(                                                     \
-          (__m512i *)(output + 0 * 16),                                        \
-          _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip));            \
-      _mm512_storeu_si512(                                                     \
-          (__m512i *)(output + 2 * 16),                                        \
-          _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip));            \
-    } else {                                                                   \
-      _mm512_storeu_si512((__m512i *)(output + 0 * 16),                        \
-                          _mm512_cvtepu8_epi16(h0));                           \
-      _mm512_storeu_si512((__m512i *)(output + 2 * 16),                        \
-                          _mm512_cvtepu8_epi16(h1));                           \
-    }                                                                          \
-  }
-/* end file src/icelake/icelake_macros.inl.cpp */
-/* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */
-// file included directly
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert_with_errors<endianness::LITTLE>(buf, len,
+                                                           utf16_output);
+}
 
-// File contains conversion procedure from VALID UTF-8 strings.
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+}
 
-/*
-    valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32.
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,
+                                                          utf16_output);
+}
 
-    The `OUTPUT` template type decides what to do with UTF-32: store
-    it directly or convert into UTF-16 (with AVX512).
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,
+                                                       utf16_output);
+}
 
-    Input:
-    - str           - valid UTF-8 string
-    - len           - string length
-    - out_buffer    - output buffer
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder converter;
+  return converter.convert(buf, len, utf32_output);
+}
 
-    Result:
-    - pair.first    - the first unprocessed input byte
-    - pair.second   - the first unprocessed output word
-*/
-template <endianness big_endian, typename OUTPUT>
-std::pair<const char *, OUTPUT *>
-valid_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) {
-  constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
-  constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
-  static_assert(
-      UTF32 or UTF16,
-      "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
-  static_assert(!(UTF32 and big_endian),
-                "we do not currently support big-endian UTF-32");
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder converter;
+  return converter.convert_with_errors(buf, len, utf32_output);
+}
 
-  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
-  const char *ptr = str;
-  const char *end = ptr + len;
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+    const char *input, size_t size, char32_t *utf32_output) const noexcept {
+  return utf8_to_utf32::convert_valid(input, size, utf32_output);
+}
 
-  OUTPUT *output = dwords;
-  /**
-   * In the main loop, we consume 64 bytes per iteration,
-   * but we access 64 + 4 bytes.
-   * We check for ptr + 64 + 64 <= end because
-   * we want to be do maskless writes without overruns.
-   */
-  while (end - ptr >= 64 + 4) {
-    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
-    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
-    const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
-    if (ascii == 0) {
-      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
-      output += 64;
-      ptr += 64;
-      continue;
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char16_t *, char *> ret =
+      haswell::avx2_convert_utf16_to_latin1<endianness::LITTLE>(buf, len,
+                                                                latin1_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - latin1_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_latin1::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
     }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
 
-    const __m512i lane0 = broadcast_epi128<0>(utf8);
-    const __m512i lane1 = broadcast_epi128<1>(utf8);
-    int valid_count0;
-    __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
-    const __m512i lane2 = broadcast_epi128<2>(utf8);
-    int valid_count1;
-    __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-    if (valid_count0 + valid_count1 <= 16) {
-      vec0 = _mm512_mask_expand_epi32(
-          vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
-      valid_count0 += valid_count1;
-      vec0 = expand_utf8_to_utf32(vec0);
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
-    } else {
-      vec0 = expand_utf8_to_utf32(vec0);
-      vec1 = expand_utf8_to_utf32(vec1);
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char16_t *, char *> ret =
+      haswell::avx2_convert_utf16_to_latin1<endianness::BIG>(buf, len,
+                                                             latin1_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - latin1_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_latin1::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
     }
-    const __m512i lane3 = broadcast_epi128<3>(utf8);
-    int valid_count2;
-    __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
-    uint32_t tmp1;
-    ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
-    const __m512i lane4 = _mm512_set1_epi32(tmp1);
-    int valid_count3;
-    __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
-    if (valid_count2 + valid_count3 <= 16) {
-      vec2 = _mm512_mask_expand_epi32(
-          vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
-      valid_count2 += valid_count3;
-      vec2 = expand_utf8_to_utf32(vec2);
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<result, char *> ret =
+      avx2_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
+          buf, len, latin1_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
     } else {
-      vec2 = expand_utf8_to_utf32(vec2);
-      vec3 = expand_utf8_to_utf32(vec3);
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
+      ret.second += scalar_res.count;
     }
-    ptr += 4 * 16;
   }
+  ret.first.count =
+      ret.second -
+      latin1_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
 
-  if (end - ptr >= 64) {
-    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
-    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
-    const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
-    if (ascii == 0) {
-      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
-      output += 64;
-      ptr += 64;
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<result, char *> ret =
+      avx2_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
+                                                                latin1_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
     } else {
-      const __m512i lane0 = broadcast_epi128<0>(utf8);
-      const __m512i lane1 = broadcast_epi128<1>(utf8);
-      int valid_count0;
-      __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
-      const __m512i lane2 = broadcast_epi128<2>(utf8);
-      int valid_count1;
-      __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-      if (valid_count0 + valid_count1 <= 16) {
-        vec0 = _mm512_mask_expand_epi32(
-            vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
-        valid_count0 += valid_count1;
-        vec0 = expand_utf8_to_utf32(vec0);
-        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
-      } else {
-        vec0 = expand_utf8_to_utf32(vec0);
-        vec1 = expand_utf8_to_utf32(vec1);
-        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
-        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
-      }
-
-      const __m512i lane3 = broadcast_epi128<3>(utf8);
-      SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
-
-      ptr += 3 * 16;
+      ret.second += scalar_res.count;
     }
   }
-  return {ptr, output};
+  ret.first.count =
+      ret.second -
+      latin1_output; // Set count to the number of 8-bit code units written
+  return ret.first;
 }
 
-using utf8_to_utf16_result = std::pair<const char *, char16_t *>;
-/* end file src/icelake/icelake_from_valid_utf8.inl.cpp */
-/* begin file src/icelake/icelake_utf8_validation.inl.cpp */
-// file included directly
-
-simdutf_really_inline __m512i check_special_cases(__m512i input,
-                                                  const __m512i prev1) {
-  __m512i mask1 = _mm512_setr_epi64(0x0202020202020202, 0x4915012180808080,
-                                    0x0202020202020202, 0x4915012180808080,
-                                    0x0202020202020202, 0x4915012180808080,
-                                    0x0202020202020202, 0x4915012180808080);
-  const __m512i v_0f = _mm512_set1_epi8(0x0f);
-  __m512i index1 = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), v_0f);
-
-  __m512i byte_1_high = _mm512_shuffle_epi8(mask1, index1);
-  __m512i mask2 = _mm512_setr_epi64(0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
-                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
-                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb,
-                                    0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb);
-  __m512i index2 = _mm512_and_si512(prev1, v_0f);
-
-  __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2);
-  __m512i mask3 =
-      _mm512_setr_epi64(0x101010101010101, 0x1010101babaaee6, 0x101010101010101,
-                        0x1010101babaaee6, 0x101010101010101, 0x1010101babaaee6,
-                        0x101010101010101, 0x1010101babaaee6);
-  __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f);
-  __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3);
-  return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  // optimization opportunity: implement a custom function
+  return convert_utf16be_to_latin1(buf, len, latin1_output);
 }
 
-simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input,
-                                                      const __m512i prev_input,
-                                                      const __m512i sc) {
-  __m512i prev2 = prev<2>(input, prev_input);
-  __m512i prev3 = prev<3>(input, prev_input);
-  __m512i is_third_byte = _mm512_subs_epu8(
-      prev2, _mm512_set1_epi8(0b11100000u - 1)); // Only 111_____ will be > 0
-  __m512i is_fourth_byte = _mm512_subs_epu8(
-      prev3, _mm512_set1_epi8(0b11110000u - 1)); // Only 1111____ will be > 0
-  __m512i is_third_or_fourth_byte =
-      _mm512_or_si512(is_third_byte, is_fourth_byte);
-  const __m512i v_7f = _mm512_set1_epi8(char(0x7f));
-  is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte);
-  // We want to compute (is_third_or_fourth_byte AND v80) XOR sc.
-  const __m512i v_80 = _mm512_set1_epi8(char(0x80));
-  return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc,
-                                   0b1101010);
-  //__m512i is_third_or_fourth_byte_mask =
-  //_mm512_and_si512(is_third_or_fourth_byte, v_80); return
-  // _mm512_xor_si512(is_third_or_fourth_byte_mask, sc);
-}
-//
-// Return nonzero if there are incomplete multibyte characters at the end of the
-// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
-//
-simdutf_really_inline __m512i is_incomplete(const __m512i input) {
-  // If the previous input's last 3 bytes match this, they're too short (they
-  // ended at EOF):
-  // ... 1111____ 111_____ 11______
-  __m512i max_value = _mm512_setr_epi64(0xffffffffffffffff, 0xffffffffffffffff,
-                                        0xffffffffffffffff, 0xffffffffffffffff,
-                                        0xffffffffffffffff, 0xffffffffffffffff,
-                                        0xffffffffffffffff, 0xbfdfefffffffffff);
-  return _mm512_subs_epu8(input, max_value);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  // optimization opportunity: implement a custom function
+  return convert_utf16le_to_latin1(buf, len, latin1_output);
 }
 
-struct avx512_utf8_checker {
-  // If this is nonzero, there has been a UTF-8 error.
-  __m512i error{};
-
-  // The last input we received
-  __m512i prev_input_block{};
-  // Whether the last input we received was incomplete (used for ASCII fast
-  // path)
-  __m512i prev_incomplete{};
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char16_t *, char *> ret =
+      haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len,
+                                                              utf8_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
 
-  //
-  // Check whether the current bytes are valid UTF-8.
-  //
-  simdutf_really_inline void check_utf8_bytes(const __m512i input,
-                                              const __m512i prev_input) {
-    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
-    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
-    // small negative numbers)
-    __m512i prev1 = prev<1>(input, prev_input);
-    __m512i sc = check_special_cases(input, prev1);
-    this->error = _mm512_or_si512(
-        check_multibyte_lengths(input, prev_input, sc), this->error);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char16_t *, char *> ret =
+      haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len,
+                                                           utf8_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf8::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
   }
+  return saved_bytes;
+}
 
-  // The only problem that can happen at EOF is that a multibyte character is
-  // too short or a byte value too large in the last bytes: check_special_cases
-  // only checks for bytes too large in the first of two bytes.
-  simdutf_really_inline void check_eof() {
-    // If the previous block had incomplete UTF-8 characters at the end, an
-    // ASCII block can't possibly finish them.
-    this->error = _mm512_or_si512(this->error, this->prev_incomplete);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char *> ret =
+      haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(
+          buf, len, utf8_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
   }
+  ret.first.count =
+      ret.second -
+      utf8_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
 
-  // returns true if ASCII.
-  simdutf_really_inline bool check_next_input(const __m512i input) {
-    const __m512i v_80 = _mm512_set1_epi8(char(0x80));
-    const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80);
-    if (ascii == 0) {
-      this->error = _mm512_or_si512(this->error, this->prev_incomplete);
-      return true;
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char *> ret =
+      haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(
+          buf, len, utf8_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
     } else {
-      this->check_utf8_bytes(input, this->prev_input_block);
-      this->prev_incomplete = is_incomplete(input);
-      this->prev_input_block = input;
-      return false;
+      ret.second += scalar_res.count;
     }
   }
-  // do not forget to call check_eof!
-  simdutf_really_inline bool errors() const {
-    return _mm512_test_epi8_mask(this->error, this->error) != 0;
-  }
-}; // struct avx512_utf8_checker
-/* end file src/icelake/icelake_utf8_validation.inl.cpp */
-/* begin file src/icelake/icelake_from_utf8.inl.cpp */
-// file included directly
+  ret.first.count =
+      ret.second -
+      utf8_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
 
-// File contains conversion procedure from possibly invalid UTF-8 strings.
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return convert_utf16le_to_utf8(buf, len, utf8_output);
+}
 
-/**
- * Attempts to convert up to len 1-byte code units from in (in UTF-8 format) to
- * out.
- * Returns the position of the input and output after the processing is
- * completed. Upon error, the output is set to null.
- */
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return convert_utf16be_to_utf8(buf, len, utf8_output);
+}
 
-template <endianness big_endian>
-utf8_to_utf16_result
-fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) {
-  const char *const final_in = in + len;
-  bool result = true;
-  while (result) {
-    if (final_in - in >= 64) {
-      result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(
-          in, out, final_in - in);
-    } else if (in < final_in) {
-      result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(
-          in, out, final_in - in);
-    } else {
-      break;
-    }
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char32_t *, char *> ret =
+      avx2_convert_utf32_to_utf8(buf, len, utf8_output);
+  if (ret.first == nullptr) {
+    return 0;
   }
-  if (!result) {
-    out = nullptr;
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
   }
-  return std::make_pair(in, out);
+  return saved_bytes;
 }
 
-template <endianness big_endian>
-simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in,
-                                                              size_t len,
-                                                              char16_t *out) {
-  const char *const init_in = in;
-  const char16_t *const init_out = out;
-  const char *const final_in = in + len;
-  bool result = true;
-  while (result) {
-    if (final_in - in >= 64) {
-      result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(
-          in, out, final_in - in);
-    } else if (in < final_in) {
-      result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(
-          in, out, final_in - in);
-    } else {
-      break;
-    }
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char32_t *, char *> ret =
+      avx2_convert_utf32_to_latin1(buf, len, latin1_output);
+  if (ret.first == nullptr) {
+    return 0;
   }
-  if (!result) {
-    size_t pos = size_t(in - init_in);
-    if (pos < len && (init_in[pos] & 0xc0) == 0x80 && pos >= 64) {
-      // We must check whether we are the fourth continuation byte
-      bool c1 = (init_in[pos - 1] & 0xc0) == 0x80;
-      bool c2 = (init_in[pos - 2] & 0xc0) == 0x80;
-      bool c3 = (init_in[pos - 3] & 0xc0) == 0x80;
-      if (c1 && c2 && c3) {
-        return {simdutf::TOO_LONG, pos};
-      }
+  size_t saved_bytes = ret.second - latin1_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
+        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
     }
-    // rewind_and_convert_with_errors will seek a potential error from in
-    // onward, with the ability to go back up to in - init_in bytes, and read
-    // final_in - in bytes forward.
-    simdutf::result res =
-        scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(
-            in - init_in, in, final_in - in, out);
-    res.count += (in - init_in);
-    return res;
-  } else {
-    return simdutf::result(error_code::SUCCESS, out - init_out);
+    saved_bytes += scalar_saved_bytes;
   }
+  return saved_bytes;
 }
 
-template <endianness big_endian, typename OUTPUT>
-// todo: replace with the utf-8 to utf-16 routine adapted to utf-32. This code
-// is legacy.
-std::pair<const char *, OUTPUT *>
-validating_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) {
-  constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
-  constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
-  static_assert(
-      UTF32 or UTF16,
-      "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
-  static_assert(!(UTF32 and big_endian),
-                "we do not currently support big-endian UTF-32");
-
-  const char *ptr = str;
-  const char *end = ptr + len;
-  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
-  OUTPUT *output = dwords;
-  avx512_utf8_checker checker{};
-  /**
-   * In the main loop, we consume 64 bytes per iteration,
-   * but we access 64 + 4 bytes.
-   * We use masked writes to avoid overruns, see
-   * https://github.com/simdutf/simdutf/issues/471
-   */
-  while (end - ptr >= 64 + 4) {
-    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
-    if (checker.check_next_input(utf8)) {
-      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
-      output += 64;
-      ptr += 64;
-      continue;
-    }
-    const __m512i lane0 = broadcast_epi128<0>(utf8);
-    const __m512i lane1 = broadcast_epi128<1>(utf8);
-    int valid_count0;
-    __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
-    const __m512i lane2 = broadcast_epi128<2>(utf8);
-    int valid_count1;
-    __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-    if (valid_count0 + valid_count1 <= 16) {
-      vec0 = _mm512_mask_expand_epi32(
-          vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
-      valid_count0 += valid_count1;
-      vec0 = expand_utf8_to_utf32(vec0);
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
-    } else {
-      vec0 = expand_utf8_to_utf32(vec0);
-      vec1 = expand_utf8_to_utf32(vec1);
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
-    }
-    const __m512i lane3 = broadcast_epi128<3>(utf8);
-    int valid_count2;
-    __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
-    uint32_t tmp1;
-    ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
-    const __m512i lane4 = _mm512_set1_epi32(tmp1);
-    int valid_count3;
-    __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
-    if (valid_count2 + valid_count3 <= 16) {
-      vec2 = _mm512_mask_expand_epi32(
-          vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
-      valid_count2 += valid_count3;
-      vec2 = expand_utf8_to_utf32(vec2);
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char *> ret =
+      avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
+        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
     } else {
-      vec2 = expand_utf8_to_utf32(vec2);
-      vec3 = expand_utf8_to_utf32(vec3);
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
+      ret.second += scalar_res.count;
     }
-    ptr += 4 * 16;
   }
-  const char *validatedptr = ptr; // validated up to ptr
-
-  // For the final pass, we validate 64 bytes, but we only transcode
-  // 3*16 bytes, so we may end up double-validating 16 bytes.
-  if (end - ptr >= 64) {
-    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
-    if (checker.check_next_input(utf8)) {
-      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
-      output += 64;
-      ptr += 64;
-    } else {
-      const __m512i lane0 = broadcast_epi128<0>(utf8);
-      const __m512i lane1 = broadcast_epi128<1>(utf8);
-      int valid_count0;
-      __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
-      const __m512i lane2 = broadcast_epi128<2>(utf8);
-      int valid_count1;
-      __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-      if (valid_count0 + valid_count1 <= 16) {
-        vec0 = _mm512_mask_expand_epi32(
-            vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
-        valid_count0 += valid_count1;
-        vec0 = expand_utf8_to_utf32(vec0);
-        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
-      } else {
-        vec0 = expand_utf8_to_utf32(vec0);
-        vec1 = expand_utf8_to_utf32(vec1);
-        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
-        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
-      }
+  ret.first.count =
+      ret.second -
+      latin1_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
 
-      const __m512i lane3 = broadcast_epi128<3>(utf8);
-      SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  return convert_utf32_to_latin1(buf, len, latin1_output);
+}
 
-      ptr += 3 * 16;
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char *> ret =
+      haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
     }
-    validatedptr += 4 * 16;
   }
-  if (end != validatedptr) {
-    const __m512i utf8 =
-        _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)),
-                                (const __m512i *)validatedptr);
-    checker.check_next_input(utf8);
+  ret.first.count =
+      ret.second -
+      utf8_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char16_t *, char32_t *> ret =
+      haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
+                                                               utf32_output);
+  if (ret.first == nullptr) { // null input cursor from the AVX2 routine => report 0
+    return 0;
   }
-  checker.check_eof();
-  if (checker.errors()) {
-    return {ptr, nullptr}; // We found an error.
+  size_t saved_bytes = ret.second - utf32_output; // char32_t units written so far (not bytes)
+  if (ret.first != buf + len) { // input not fully consumed: scalar code converts the rest
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { // a zero from the tail makes the whole call return 0
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
   }
-  return {ptr, output};
+  return saved_bytes;
 }
 
-// Like validating_utf8_to_fixed_length but returns as soon as an error is
-// identified todo: replace with the utf-8 to utf-16 routine adapted to utf-32.
-// This code is legacy.
-template <endianness big_endian, typename OUTPUT>
-std::tuple<const char *, OUTPUT *, bool>
-validating_utf8_to_fixed_length_with_constant_checks(const char *str,
-                                                     size_t len,
-                                                     OUTPUT *dwords) {
-  constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
-  constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
-  static_assert(
-      UTF32 or UTF16,
-      "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)");
-  static_assert(!(UTF32 and big_endian),
-                "we do not currently support big-endian UTF-32");
-
-  const char *ptr = str;
-  const char *end = ptr + len;
-  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
-  OUTPUT *output = dwords;
-  avx512_utf8_checker checker{};
-  /**
-   * In the main loop, we consume 64 bytes per iteration,
-   * but we access 64 + 4 bytes.
-   */
-  while (end - ptr >= 4 + 64) {
-    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
-    bool ascii = checker.check_next_input(utf8);
-    if (checker.errors()) {
-      return {ptr, output, false}; // We found an error.
-    }
-    if (ascii) {
-      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
-      output += 64;
-      ptr += 64;
-      continue;
-    }
-    const __m512i lane0 = broadcast_epi128<0>(utf8);
-    const __m512i lane1 = broadcast_epi128<1>(utf8);
-    int valid_count0;
-    __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
-    const __m512i lane2 = broadcast_epi128<2>(utf8);
-    int valid_count1;
-    __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-    if (valid_count0 + valid_count1 <= 16) {
-      vec0 = _mm512_mask_expand_epi32(
-          vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
-      valid_count0 += valid_count1;
-      vec0 = expand_utf8_to_utf32(vec0);
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
-    } else {
-      vec0 = expand_utf8_to_utf32(vec0);
-      vec1 = expand_utf8_to_utf32(vec1);
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
-    }
-    const __m512i lane3 = broadcast_epi128<3>(utf8);
-    int valid_count2;
-    __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2);
-    uint32_t tmp1;
-    ::memcpy(&tmp1, ptr + 64, sizeof(tmp1));
-    const __m512i lane4 = _mm512_set1_epi32(tmp1);
-    int valid_count3;
-    __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3);
-    if (valid_count2 + valid_count3 <= 16) {
-      vec2 = _mm512_mask_expand_epi32(
-          vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3);
-      valid_count2 += valid_count3;
-      vec2 = expand_utf8_to_utf32(vec2);
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
-    } else {
-      vec2 = expand_utf8_to_utf32(vec2);
-      vec3 = expand_utf8_to_utf32(vec3);
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
-      SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char16_t *, char32_t *> ret =
+      haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len,
+                                                            utf32_output);
+  if (ret.first == nullptr) { // null input cursor from the AVX2 routine => report 0
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf32_output; // char32_t units written so far (not bytes)
+  if (ret.first != buf + len) { // input not fully consumed: scalar code converts the rest
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf32::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { // a zero from the tail makes the whole call return 0
+      return 0;
     }
-    ptr += 4 * 16;
+    saved_bytes += scalar_saved_bytes;
   }
-  const char *validatedptr = ptr; // validated up to ptr
+  return saved_bytes;
+}
 
-  // For the final pass, we validate 64 bytes, but we only transcode
-  // 3*16 bytes, so we may end up double-validating 16 bytes.
-  if (end - ptr >= 64) {
-    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
-    bool ascii = checker.check_next_input(utf8);
-    if (checker.errors()) {
-      return {ptr, output, false}; // We found an error.
-    }
-    if (ascii) {
-      SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
-      output += 64;
-      ptr += 64;
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // ret.first.count is always the position in the input buffer (even when the
+  // AVX2 routine has finished), not the number of code units written.
+  std::pair<result, char32_t *> ret =
+      haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(
+          buf, len, utf32_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count; // rebase onto buf: the tail started at this offset
+      return scalar_res;
     } else {
-      const __m512i lane0 = broadcast_epi128<0>(utf8);
-      const __m512i lane1 = broadcast_epi128<1>(utf8);
-      int valid_count0;
-      __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0);
-      const __m512i lane2 = broadcast_epi128<2>(utf8);
-      int valid_count1;
-      __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1);
-      if (valid_count0 + valid_count1 <= 16) {
-        vec0 = _mm512_mask_expand_epi32(
-            vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1);
-        valid_count0 += valid_count1;
-        vec0 = expand_utf8_to_utf32(vec0);
-        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
-      } else {
-        vec0 = expand_utf8_to_utf32(vec0);
-        vec1 = expand_utf8_to_utf32(vec1);
-        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
-        SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
-      }
-
-      const __m512i lane3 = broadcast_epi128<3>(utf8);
-      SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true)
+      ret.second += scalar_res.count; // advance the output cursor by the tail's count
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf32_output; // Set count to the number of 32-bit code units written
+  return ret.first;
+}
 
-      ptr += 3 * 16;
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // ret.first.count is always the position in the input buffer (even when the
+  // AVX2 routine has finished), not the number of code units written.
+  std::pair<result, char32_t *> ret =
+      haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(
+          buf, len, utf32_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count; // rebase onto buf: the tail started at this offset
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count; // advance the output cursor by the tail's count
     }
-    validatedptr += 4 * 16;
-  }
-  if (end != validatedptr) {
-    const __m512i utf8 =
-        _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)),
-                                (const __m512i *)validatedptr);
-    checker.check_next_input(utf8);
-  }
-  checker.check_eof();
-  if (checker.errors()) {
-    return {ptr, output, false}; // We found an error.
   }
-  return {ptr, output, true};
+  ret.first.count =
+      ret.second -
+      utf32_output; // Set count to the number of 32-bit code units written
+  return ret.first;
 }
-/* end file src/icelake/icelake_from_utf8.inl.cpp */
-/* begin file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */
-// file included directly
 
-// File contains conversion procedure from possibly invalid UTF-8 strings.
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  return convert_utf32_to_utf8(buf, len, utf8_output); // delegates: no separate valid-input kernel here
+}
 
-template <bool is_remaining>
-simdutf_really_inline size_t process_block_from_utf8_to_latin1(
-    const char *buf, size_t len, char *latin_output, __m512i minus64,
-    __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) {
-  __mmask64 load_mask =
-      is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL;
-  __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf);
-  __mmask64 nonascii = _mm512_movepi8_mask(input);
-  if (nonascii == 0) {
-    if (*next_leading_ptr) { // If we ended with a leading byte, it is an error.
-      return 0;              // Indicates error
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char32_t *, char16_t *> ret =
+      avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  if (ret.first == nullptr) { // null input cursor from the AVX2 routine => report 0
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf16_output; // char16_t units written so far (not bytes)
+  if (ret.first != buf + len) { // input not fully consumed: scalar code converts the rest
+    const size_t scalar_saved_bytes =
+        scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { // a zero from the tail makes the whole call return 0
+      return 0;
     }
-    is_remaining
-        ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
-        : _mm512_storeu_si512((__m512i *)latin_output, input);
-    return len;
+    saved_bytes += scalar_saved_bytes;
   }
+  return saved_bytes;
+}
 
-  const __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char32_t *, char16_t *> ret =
+      avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  if (ret.first == nullptr) { // null input cursor from the AVX2 routine => report 0
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf16_output; // char16_t units written so far (not bytes)
+  if (ret.first != buf + len) { // input not fully consumed: scalar code converts the rest
+    const size_t scalar_saved_bytes =
+        scalar::utf32_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) { // a zero from the tail makes the whole call return 0
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
 
-  __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62));
-  __mmask64 invalid_leading_bytes =
-      _mm512_mask_cmpgt_epu8_mask(leading, highbits, one);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // ret.first.count is always the position in the input buffer (even when the
+  // AVX2 routine has finished), not the number of code units written.
+  std::pair<result, char16_t *> ret =
+      haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
+          buf, len, utf16_output);
+  if (ret.first.count != len) { // AVX2 routine stopped short: scalar code takes over
+    result scalar_res =
+        scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count; // rebase onto buf: the tail started at this offset
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count; // advance the output cursor by the tail's count
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf16_output; // Set count to the number of 16-bit code units written
+  return ret.first;
+}
 
-  if (invalid_leading_bytes) {
-    return 0; // Indicates error
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // ret.first.count is always the position in the input buffer (even when the
+  // AVX2 routine has finished), not the number of code units written.
+  std::pair<result, char16_t *> ret =
+      haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(
+          buf, len, utf16_output);
+  if (ret.first.count != len) { // AVX2 routine stopped short: scalar code takes over
+    result scalar_res =
+        scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count; // rebase onto buf: the tail started at this offset
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count; // advance the output cursor by the tail's count
+    }
   }
+  ret.first.count =
+      ret.second -
+      utf16_output; // Set count to the number of 16-bit code units written
+  return ret.first;
+}
 
-  __mmask64 leading_shift = (leading << 1) | *next_leading_ptr;
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return convert_utf32_to_utf16le(buf, len, utf16_output); // delegates: no separate valid-input kernel here
+}
 
-  if ((nonascii ^ leading) != leading_shift) {
-    return 0; // Indicates error
-  }
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return convert_utf32_to_utf16be(buf, len, utf16_output); // delegates: no separate valid-input kernel here
+}
 
-  const __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one);
-  input =
-      _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return convert_utf16le_to_utf32(buf, len, utf32_output); // delegates: no separate valid-input kernel here
+}
 
-  __mmask64 retain = ~leading & load_mask;
-  __m512i output = _mm512_maskz_compress_epi8(retain, input);
-  int64_t written_out = count_ones(retain);
-  if (written_out == 0) {
-    return 0; // Indicates error
-  }
-  *next_bit6_ptr = bit6 >> 63;
-  *next_leading_ptr = leading >> 63;
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return convert_utf16be_to_utf32(buf, len, utf32_output); // delegates: no separate valid-input kernel here
+}
 
-  __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out);
+void implementation::change_endianness_utf16(const char16_t *input,
+                                             size_t length,
+                                             char16_t *output) const noexcept {
+  utf16::change_endianness_utf16(input, length, output); // forwards to the utf16:: helper of the same name
+}
 
-  _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);
+simdutf_warn_unused size_t implementation::count_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::count_code_points<endianness::LITTLE>(input, length); // little-endian instantiation of the utf16:: helper
+}
 
-  return written_out;
+simdutf_warn_unused size_t implementation::count_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::count_code_points<endianness::BIG>(input, length); // big-endian instantiation of the utf16:: helper
 }
 
-size_t utf8_to_latin1_avx512(const char *&inbuf, size_t len,
-                             char *&inlatin_output) {
-  const char *buf = inbuf;
-  char *latin_output = inlatin_output;
-  char *start = latin_output;
-  size_t pos = 0;
-  __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 1100 0000
-  __m512i one = _mm512_set1_epi8(1);
-  __mmask64 next_leading = 0;
-  __mmask64 next_bit6 = 0;
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept {
+  return utf8::count_code_points(input, length); // code-point count comes from the utf8:: helper
+}
 
-  while (pos + 64 <= len) {
-    size_t written = process_block_from_utf8_to_latin1<false>(
-        buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6);
-    if (written == 0) {
-      inlatin_output = latin_output;
-      inbuf = buf + pos - next_leading;
-      return 0; // Indicates error at pos or after, or just before pos (too
-                // short error)
-    }
-    latin_output += written;
-    pos += 64;
-  }
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+    const char *buf, size_t len) const noexcept {
+  return count_utf8(buf, len); // the Latin-1 length is taken to be count_utf8's result
+}
 
-  if (pos < len) {
-    size_t remaining = len - pos;
-    size_t written = process_block_from_utf8_to_latin1<true>(
-        buf + pos, remaining, latin_output, minus64, one, &next_leading,
-        &next_bit6);
-    if (written == 0) {
-      inbuf = buf + pos - next_leading;
-      inlatin_output = latin_output;
-      return 0; // Indicates error at pos or after, or just before pos (too
-                // short error)
-    }
-    latin_output += written;
-  }
-  if (next_leading) {
-    inbuf = buf + len - next_leading;
-    inlatin_output = latin_output;
-    return 0; // Indicates error at end of buffer
-  }
-  inlatin_output = latin_output;
-  inbuf += len;
-  return size_t(latin_output - start);
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t length) const noexcept {
+  return scalar::utf16::latin1_length_from_utf16(length); // scalar helper; takes only the length
 }
-/* end file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */
-/* begin file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */
-// file included directly
 
-// File contains conversion procedure from valid UTF-8 strings.
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t length) const noexcept {
+  return scalar::utf32::latin1_length_from_utf32(length); // scalar helper; takes only the length
+}
 
-template <bool is_remaining>
-simdutf_really_inline size_t process_valid_block_from_utf8_to_latin1(
-    const char *buf, size_t len, char *latin_output, __m512i minus64,
-    __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) {
-  __mmask64 load_mask =
-      is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL;
-  __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf);
-  __mmask64 nonascii = _mm512_movepi8_mask(input);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length); // little-endian instantiation of the utf16:: helper
+}
 
-  if (nonascii == 0) {
-    is_remaining
-        ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
-        : _mm512_storeu_si512((__m512i *)latin_output, input);
-    return len;
-  }
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16<endianness::BIG>(input, length); // big-endian instantiation of the utf16:: helper
+}
 
-  __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length); // little-endian instantiation of the utf16:: helper
+}
 
-  __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62));
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length); // big-endian instantiation of the utf16:: helper
+}
 
-  *next_leading_ptr = leading >> 63;
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t length) const noexcept {
+  return scalar::latin1::utf16_length_from_latin1(length); // scalar helper; takes only the length
+}
 
-  __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one);
-  input =
-      _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64);
-  *next_bit6_ptr = bit6 >> 63;
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  return utf8::utf16_length_from_utf8(input, length); // forwards to the utf8:: helper of the same name
+}
 
-  __mmask64 retain = ~leading & load_mask;
-  __m512i output = _mm512_maskz_compress_epi8(retain, input);
-  int64_t written_out = count_ones(retain);
-  if (written_out == 0) {
-    return 0; // Indicates error
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t length) const noexcept {
+  return scalar::latin1::utf32_length_from_latin1(length); // scalar helper; takes only the length
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+    const char *input, size_t len) const noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
+  size_t answer = len / sizeof(__m256i) * sizeof(__m256i); // start with 1 per byte held in whole vectors
+  size_t i = 0;
+  if (answer >= 2048) { // long strings optimization
+    __m256i four_64bits = _mm256_setzero_si256();
+    while (i + sizeof(__m256i) <= len) {
+      __m256i runner = _mm256_setzero_si256();
+      // Each 8-bit lane of runner gains at most 1 per vector: cap at 255 vectors.
+      size_t iterations = (len - i) / sizeof(__m256i);
+      if (iterations > 255) {
+        iterations = 255;
+      }
+      size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i); // start offset of this pass's last vector
+      for (; i + 4 * sizeof(__m256i) <= max_i; i += 4 * sizeof(__m256i)) {
+        __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i));
+        __m256i input2 =
+            _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i)));
+        __m256i input3 = _mm256_loadu_si256(
+            (const __m256i *)(data + i + 2 * sizeof(__m256i)));
+        __m256i input4 = _mm256_loadu_si256(
+            (const __m256i *)(data + i + 3 * sizeof(__m256i)));
+        __m256i input12 =
+            _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1),
+                            _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2));
+        __m256i input23 =
+            _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3),
+                            _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4));
+        __m256i input1234 = _mm256_add_epi8(input12, input23);
+        runner = _mm256_sub_epi8(runner, input1234); // compare results are 0 or -1 per byte, so subtracting counts them
+      }
+      for (; i <= max_i; i += sizeof(__m256i)) {
+        __m256i input_256_chunk =
+            _mm256_loadu_si256((const __m256i *)(data + i));
+        runner = _mm256_sub_epi8(
+            runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk));
+      }
+      four_64bits = _mm256_add_epi64(
+          four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256())); // fold the byte counters into four 64-bit sums
+    }
+    answer += _mm256_extract_epi64(four_64bits, 0) +
+              _mm256_extract_epi64(four_64bits, 1) +
+              _mm256_extract_epi64(four_64bits, 2) +
+              _mm256_extract_epi64(four_64bits, 3);
+  } else if (answer > 0) {
+    for (; i + sizeof(__m256i) <= len; i += sizeof(__m256i)) {
+      __m256i latin = _mm256_loadu_si256((const __m256i *)(data + i));
+      uint32_t non_ascii = _mm256_movemask_epi8(latin); // one bit per byte whose top bit is set
+      answer += count_ones(non_ascii);
+    }
   }
-  __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out);
-  // Optimization opportunity: sometimes, masked writes are not needed.
-  _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);
-  return written_out;
+  return answer + scalar::latin1::utf8_length_from_latin1(
+                      reinterpret_cast<const char *>(data + i), len - i); // bytes from offset i onward go to the scalar routine
 }
 
-size_t valid_utf8_to_latin1_avx512(const char *buf, size_t len,
-                                   char *latin_output) {
-  char *start = latin_output;
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept {
+  const __m256i v_00000000 = _mm256_setzero_si256();
+  const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80);
+  const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800);
+  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
   size_t pos = 0;
-  __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 1100 0000
-  __m512i one = _mm512_set1_epi8(1);
-  __mmask64 next_leading = 0;
-  __mmask64 next_bit6 = 0;
+  size_t count = 0;
+  for (; pos + 8 <= length; pos += 8) { // eight 32-bit values (one 256-bit vector) per iteration
+    __m256i in = _mm256_loadu_si256((__m256i *)(input + pos));
+    const __m256i ascii_bytes_bytemask =
+        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000); // all-ones lanes: value < 0x80
+    const __m256i one_two_bytes_bytemask =
+        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000); // all-ones lanes: value < 0x800
+    const __m256i two_bytes_bytemask =
+        _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask);
+    const __m256i one_two_three_bytes_bytemask =
+        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); // all-ones lanes: value < 0x10000
+    const __m256i three_bytes_bytemask =
+        _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
+    const uint32_t ascii_bytes_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(ascii_bytes_bytemask));
+    const uint32_t two_bytes_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(two_bytes_bytemask));
+    const uint32_t three_bytes_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(three_bytes_bytemask));
 
-  while (pos + 64 <= len) {
-    size_t written = process_valid_block_from_utf8_to_latin1<false>(
-        buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6);
-    latin_output += written;
-    pos += 64;
+    size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4; // movemask gives 4 bits per 32-bit lane
+    size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
+    size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
+    count += 32 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count; // 4 bytes x 8 lanes, minus what shorter forms save
   }
+  return count +
+         scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); // scalar routine covers the last (length - pos) values
+}
 
-  if (pos < len) {
-    size_t remaining = len - pos;
-    size_t written = process_valid_block_from_utf8_to_latin1<true>(
-        buf + pos, remaining, latin_output, minus64, one, &next_leading,
-        &next_bit6);
-    latin_output += written;
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept {
+  const __m256i v_00000000 = _mm256_setzero_si256();
+  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  size_t pos = 0;
+  size_t count = 0;
+  for (; pos + 8 <= length; pos += 8) { // eight 32-bit values (one 256-bit vector) per iteration
+    __m256i in = _mm256_loadu_si256((__m256i *)(input + pos));
+    const __m256i surrogate_bytemask =
+        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); // despite the name: all-ones where the upper 16 bits are ZERO
+    const uint32_t surrogate_bitmask =
+        static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask));
+    size_t surrogate_count = (32 - count_ones(surrogate_bitmask)) / 4; // lanes whose upper 16 bits are nonzero
+    count += 8 + surrogate_count; // one unit per lane, plus one more for each lane counted above
   }
+  return count +
+         scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); // scalar routine covers the last (length - pos) values
+}
 
-  return (size_t)(latin_output - start);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  return utf8::count_code_points(input, length); // the UTF-32 length is the code-point count from the utf8:: helper
 }
-/* end file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */
-/* begin file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */
-// file included directly
-template <endianness big_endian>
-size_t icelake_convert_utf16_to_latin1(const char16_t *buf, size_t len,
-                                       char *latin1_output) {
-  const char16_t *end = buf + len;
-  __m512i v_0xFF = _mm512_set1_epi16(0xff);
-  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
-  __m512i shufmask = _mm512_set_epi8(
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38,
-      36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
-  while (end - buf >= 32) {
-    __m512i in = _mm512_loadu_si512((__m512i *)buf);
-    if (big_endian) {
-      in = _mm512_shuffle_epi8(in, byteflip);
-    }
-    if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
-      return 0;
-    }
-    _mm256_storeu_si256(
-        (__m256i *)latin1_output,
-        _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
-    latin1_output += 32;
-    buf += 32;
-  }
-  if (buf < end) {
-    uint32_t mask(uint32_t(1 << (end - buf)) - 1);
-    __m512i in = _mm512_maskz_loadu_epi16(mask, buf);
-    if (big_endian) {
-      in = _mm512_shuffle_epi8(in, byteflip);
-    }
-    if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
-      return 0;
-    }
-    _mm256_mask_storeu_epi8(
-        latin1_output, mask,
-        _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
-  }
-  return len;
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char *input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length); // char overload: forwards to the scalar base64 helper
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  return (options & base64_url) // the base64_url bit picks the <true>/<false> instantiation
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options,
+                                             last_chunk_options);
+}
+
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  return (options & base64_url) // the base64_url bit picks the <true>/<false> instantiation
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options,
+                                             last_chunk_options);
+}
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char16_t *input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length); // char16_t overload: forwards to the scalar base64 helper
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  return (options & base64_url) // the base64_url bit picks the <true>/<false> instantiation
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options,
+                                             last_chunk_options);
 }
 
-template <endianness big_endian>
-std::pair<result, char *>
-icelake_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
-                                            char *latin1_output) {
-  const char16_t *end = buf + len;
-  const char16_t *start = buf;
-  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
-  __m512i v_0xFF = _mm512_set1_epi16(0xff);
-  __m512i shufmask = _mm512_set_epi8(
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38,
-      36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
-  while (end - buf >= 32) {
-    __m512i in = _mm512_loadu_si512((__m512i *)buf);
-    if (big_endian) {
-      in = _mm512_shuffle_epi8(in, byteflip);
-    }
-    if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
-      uint16_t word;
-      while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf))
-                                 : uint16_t(*buf))) <= 0xff) {
-        *latin1_output++ = uint8_t(word);
-        buf++;
-      }
-      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
-                            latin1_output);
-    }
-    _mm256_storeu_si256(
-        (__m256i *)latin1_output,
-        _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
-    latin1_output += 32;
-    buf += 32;
-  }
-  if (buf < end) {
-    uint32_t mask(uint32_t(1 << (end - buf)) - 1);
-    __m512i in = _mm512_maskz_loadu_epi16(mask, buf);
-    if (big_endian) {
-      in = _mm512_shuffle_epi8(in, byteflip);
-    }
-    if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) {
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  return (options & base64_url)
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options,
+                                             last_chunk_options);
+}
 
-      uint16_t word;
-      while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf))
-                                 : uint16_t(*buf))) <= 0xff) {
-        *latin1_output++ = uint8_t(word);
-        buf++;
-      }
-      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
-                            latin1_output);
-    }
-    _mm256_mask_storeu_epi8(
-        latin1_output, mask,
-        _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in)));
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+    size_t length, base64_options options) const noexcept {
+  return scalar::base64::base64_length_from_binary(length, options);
+}
+
+size_t implementation::binary_to_base64(const char *input, size_t length,
+                                        char *output,
+                                        base64_options options) const noexcept {
+  if (options & base64_url) {
+    return encode_base64<true>(output, input, length, options);
+  } else {
+    return encode_base64<false>(output, input, length, options);
   }
-  return std::make_pair(result(error_code::SUCCESS, len), latin1_output);
 }
-/* end file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */
-/* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
-// file included directly
+} // namespace haswell
+} // namespace simdutf
 
-/**
- * This function converts the input (inbuf, inlen), assumed to be valid
- * UTF16 (little endian) into UTF-8 (to outbuf). The number of code units
- * written is written to 'outlen' and the function reports the number of input
- * word consumed.
- */
-template <endianness big_endian>
-size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
-                             unsigned char *outbuf, size_t *outlen) {
-  __m512i in;
-  __mmask32 inmask = _cvtu32_mask32(0x7fffffff);
-  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
-  const char16_t *const inbuf_orig = inbuf;
-  const unsigned char *const outbuf_orig = outbuf;
-  int adjust = 0;
-  int carry = 0;
+/* begin file src/simdutf/haswell/end.h */
+#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
+// nothing needed.
+#else
+SIMDUTF_UNTARGET_REGION
+#endif
 
-  while (inlen >= 32) {
-    in = _mm512_loadu_si512(inbuf);
-    if (big_endian) {
-      in = _mm512_shuffle_epi8(in, byteflip);
-    }
-    inlen -= 31;
-  lastiteration:
-    inbuf += 31;
 
-  failiteration:
-    const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask(
-        inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT);
+#if SIMDUTF_GCC11ORMORE // workaround for
+                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
+SIMDUTF_POP_DISABLE_WARNINGS
+#endif // end of workaround
+/* end file src/simdutf/haswell/end.h */
+/* end file src/haswell/implementation.cpp */
+#endif
+#if SIMDUTF_IMPLEMENTATION_PPC64
+/* begin file src/ppc64/implementation.cpp */
 
-    if (_ktestz_mask32_u8(inmask, is234byte)) {
-      // fast path for ASCII only
-      _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in);
-      outbuf += 31;
-      carry = 0;
 
-      if (inlen < 32) {
-        goto tail;
-      } else {
-        continue;
-      }
-    }
 
-    const __mmask32 is12byte =
-        _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT);
 
-    if (_ktestc_mask32_u8(is12byte, inmask)) {
-      // fast path for 1 and 2 byte only
 
-      const __m512i twobytes = _mm512_ternarylogic_epi32(
-          _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6),
-          _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C
-      in = _mm512_mask_add_epi16(in, is234byte, twobytes,
-                                 _mm512_set1_epi16(int16_t(0x80c0)));
-      const __m512i cmpmask =
-          _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)),
-                                  _mm512_set1_epi16(0x0800));
-      const __mmask64 smoosh =
-          _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT);
-      const __m512i out = _mm512_maskz_compress_epi8(smoosh, in);
-      _mm512_mask_storeu_epi8(outbuf,
-                              _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh),
-                                                       _cvtmask64_u64(smoosh))),
-                              out);
-      outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte));
-      carry = 0;
+/* begin file src/simdutf/ppc64/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "ppc64"
+// #define SIMDUTF_IMPLEMENTATION ppc64
+/* end file src/simdutf/ppc64/begin.h */
+namespace simdutf {
+namespace ppc64 {
+namespace {
+#ifndef SIMDUTF_PPC64_H
+  #error "ppc64.h must be included"
+#endif
+using namespace simd;
 
-      if (inlen < 32) {
-        goto tail;
-      } else {
-        continue;
-      }
-    }
-    __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
-    __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
+  // careful: 0x80 is not ascii.
+  return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
+}
 
-    __m512i taglo = _mm512_set1_epi32(0x8080e000);
-    __m512i taghi = taglo;
+simdutf_unused simdutf_really_inline simd8<bool>
+must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
+                     const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_second_byte =
+      prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
+  simd8<uint8_t> is_third_byte =
+      prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
+  simd8<uint8_t> is_fourth_byte =
+      prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
+  // Caller requires a bool (all 1's). All values resulting from the subtraction
+  // will be <= 64, so signed comparison is fine.
+  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) >
+         int8_t(0);
+}
 
-    const __m512i fc00masked =
-        _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00)));
-    const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask(
-        inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ);
-    const __mmask32 losurr = _mm512_cmp_epu16_mask(
-        fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ);
+simdutf_really_inline simd8<bool>
+must_be_2_3_continuation(const simd8<uint8_t> prev2,
+                         const simd8<uint8_t> prev3) {
+  simd8<uint8_t> is_third_byte =
+      prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80
+  simd8<uint8_t> is_fourth_byte =
+      prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80
+  // High bit (0x80) set where the byte must be a 3rd/4th continuation byte. No
+  // comparison is made; check_multibyte_lengths masks the result with 0x80.
+  return simd8<bool>(is_third_byte | is_fourth_byte);
+}
 
-    int carryout = 0;
-    if (!_kortestz_mask32_u8(hisurr, losurr)) {
-      // handle surrogates
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
 
-      __m512i los = _mm512_alignr_epi32(hi, lo, 1);
-      __m512i his = _mm512_alignr_epi32(lo, hi, 1);
+/* begin file src/generic/buf_block_reader.h */
+namespace simdutf {
+namespace ppc64 {
+namespace {
 
-      const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16);
-      taglo = _mm512_mask_mov_epi32(taglo, __mmask16(hisurr),
-                                    _mm512_set1_epi32(0x808080f0));
-      taghi = _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi),
-                                    _mm512_set1_epi32(0x808080f0));
+// Walks through a buffer in block-sized increments, loading the last part with
+// spaces
+template <size_t STEP_SIZE> struct buf_block_reader {
+public:
+  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
+  simdutf_really_inline size_t block_index();
+  simdutf_really_inline bool has_full_block() const;
+  simdutf_really_inline const uint8_t *full_block() const;
+  /**
+   * Get the last block, padded with spaces.
+   *
+   * There will always be a last block, with at least 1 byte, unless nothing
+   * remains (len == idx, e.g. len == 0), in which case this function returns 0
+   * and leaves dst untouched. In particular, if len == STEP_SIZE there will be
+   * 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no padding.
+   *
+   * @return the number of effective characters in the last block.
+   */
+  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
+  simdutf_really_inline void advance();
 
-      lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10);
-      hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10);
-      los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400));
-      his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400));
-      lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los);
-      hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his);
+private:
+  const uint8_t *buf;
+  const size_t len;
+  const size_t lenminusstep;
+  size_t idx;
+};
 
-      carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30));
+// Routines to print masks and text for debugging bitmask operations
+simdutf_unused static char *format_input_text_64(const uint8_t *text) {
+  static char *buf =
+      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
+}
 
-      const uint32_t h = _cvtmask32_u32(hisurr);
-      const uint32_t l = _cvtmask32_u32(losurr);
-      // check for mismatched surrogates
-      if ((h + h + carry) ^ l) {
-        const uint32_t lonohi = l & ~(h + h + carry);
-        const uint32_t hinolo = h & ~(l >> 1);
-        inlen = _tzcnt_u32(hinolo | lonohi);
-        inmask = __mmask32(0x7fffffff & ((1U << inlen) - 1));
-        in = _mm512_maskz_mov_epi16(inmask, in);
-        adjust = (int)inlen - 31;
-        inlen = 0;
-        goto failiteration;
-      }
+// Debug helper: stores `in` into a static buffer, masking unprintable bytes
+simdutf_unused static char *format_input_text(const simd8x64<uint8_t> &in) {
+  static char *buf =
+      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+  in.store(reinterpret_cast<uint8_t *>(buf));
+  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+    if (int8_t(buf[i]) < ' ') { // signed compare: bytes >= 0x80 become '_' too
+      buf[i] = '_';
     }
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf; // shared static buffer: overwritten by the next call
+}
 
-    hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff), hi);
-    carry = carryout;
+simdutf_unused static char *format_mask(uint64_t mask) { // debug helper
+  static char *buf = reinterpret_cast<char *>(malloc(64 + 1));
+  for (size_t i = 0; i < 64; i++) { // bit i (least significant first) -> buf[i]
+    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' ';
+  } // uint64_t(1): a size_t one makes shifts >= 32 undefined if size_t is 32-bit
+  buf[64] = '\0';
+  return buf; // shared static buffer: overwritten by the next call
+}
 
-    __m512i mslo =
-        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo);
+template <size_t STEP_SIZE>
+simdutf_really_inline
+buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
+    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
+      idx{0} {}
 
-    __m512i mshi =
-        _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi);
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
+  return idx;
+}
 
-    const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask));
-    const __mmask64 outmhi = _kshiftri_mask64(outmask, 16);
+template <size_t STEP_SIZE>
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
+  return idx < lenminusstep;
+}
 
-    const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte));
-    const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16);
-    const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16);
+template <size_t STEP_SIZE>
+simdutf_really_inline const uint8_t *
+buf_block_reader<STEP_SIZE>::full_block() const {
+  return &buf[idx];
+}
 
-    taglo = _mm512_mask_mov_epi32(taglo, __mmask16(is12byte),
-                                  _mm512_set1_epi32(0x80c00000));
-    taghi = _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi),
-                                  _mm512_set1_epi32(0x80c00000));
-    __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask),
-                                              _mm512_set1_epi32(0xffffffff),
-                                              _mm512_set1_epi32(0x00010101));
-    __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi),
-                                              _mm512_set1_epi32(0xffffffff),
-                                              _mm512_set1_epi32(0x00010101));
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t
+buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
+  if (len == idx) { // nothing left to copy: dst is not written in this case
+    return 0;
+  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
+  std::memset(dst, 0x20, // dst must have room for STEP_SIZE bytes
+              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
+                          // to write out 8 or 16 bytes at once.
+  std::memcpy(dst, buf + idx, len - idx); // overlay the remaining input bytes
+  return len - idx; // number of real (non-padding) bytes placed in dst
+}
 
-    magiclo = _mm512_mask_blend_epi32(__mmask16(outmask),
-                                      _mm512_set1_epi32(0xffffffff),
-                                      _mm512_set1_epi32(0x00010101));
-    magichi = _mm512_mask_blend_epi32(__mmask16(outmhi),
-                                      _mm512_set1_epi32(0xffffffff),
-                                      _mm512_set1_epi32(0x00010101));
+template <size_t STEP_SIZE>
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
+  idx += STEP_SIZE;
+}
 
-    mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo,
-                                     0xea); // A&B|C
-    mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi,
-                                     0xea);
-    mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24);
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/buf_block_reader.h */
+/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace utf8_validation {
 
-    mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24);
+using namespace simd;
 
-    const __mmask64 wantlo =
-        _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT);
-    const __mmask64 wanthi =
-        _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT);
-    const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo);
-    const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi);
-    const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo);
-    const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi);
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation)
+  // Bit 2 = Overlong 3-byte; Bit 3 = Too Large
+  // Bit 4 = Surrogate
+  // Bit 5 = Overlong 2-byte; Bit 6 = Too Large (1000____) / Overlong 4-byte
+  // Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
+
+              // ____0100 ________
+              CARRY | TOO_LARGE,
+              // ____0101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____011_ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+
+              // ____1___ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____1101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
+
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high);
+}
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) {
+  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
+  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
+  simd8<uint8_t> must23 =
+      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
+  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
+  return must23_80 ^ sc;
+}
+
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the
+// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
+//
+simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
+  // If the previous input's last 3 bytes match this, they're too short (they
+  // ended at EOF):
+  // ... 1111____ 111_____ 11______
+  static const uint8_t max_array[32] = {255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        0b11110000u - 1,
+                                        0b11100000u - 1,
+                                        0b11000000u - 1};
+  const simd8<uint8_t> max_value(
+      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
+  return input.gt_bits(max_value);
+}
 
-    uint64_t advlo = _mm_popcnt_u64(wantlo_uint64);
-    uint64_t advhi = _mm_popcnt_u64(wanthi_uint64);
+struct utf8_checker {
+  // If this is nonzero, there has been a UTF-8 error.
+  simd8<uint8_t> error;
+  // The last input we received
+  simd8<uint8_t> prev_input_block;
+  // Whether the last input we received was incomplete (used for ASCII fast
+  // path)
+  simd8<uint8_t> prev_incomplete;
 
-    _mm512_mask_storeu_epi8(
-        outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo);
-    _mm512_mask_storeu_epi8(
-        outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)),
-        outhi);
-    outbuf += advlo + advhi;
+  //
+  // Check whether the current bytes are valid UTF-8.
+  //
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // prev1 is derived from input and prev_input via prev<1>() (presumably
+    // each byte's predecessor; confirm in simd8::prev). The special-case flags
+    // are then combined with the multibyte length check and OR-ed into error.
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    simd8<uint8_t> sc = check_special_cases(input, prev1);
+    this->error |= check_multibyte_lengths(input, prev_input, sc);
   }
-  outbuf += -adjust;
 
-tail:
-  if (inlen != 0) {
-    // We must have inlen < 31.
-    inmask = _cvtu32_mask32((1U << inlen) - 1);
-    in = _mm512_maskz_loadu_epi16(inmask, inbuf);
-    if (big_endian) {
-      in = _mm512_shuffle_epi8(in, byteflip);
-    }
-    adjust = (int)inlen - 31;
-    inlen = 0;
-    goto lastiteration;
+  // The only problem that can happen at EOF is that a multibyte character is
+  // too short or a byte value too large in the last bytes: check_special_cases
+  // only checks for bytes too large in the first of two bytes.
+  simdutf_really_inline void check_eof() {
+    // If the previous block had incomplete UTF-8 characters at the end, an
+    // ASCII block can't possibly finish them.
+    this->error |= this->prev_incomplete;
   }
-  *outlen = (outbuf - outbuf_orig) + adjust;
-  return ((inbuf - inbuf_orig) + adjust);
-}
-/* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */
-/* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
-// file included directly
 
-/*
-  Returns a pair: the first unprocessed byte from buf and utf32_output
-  A scalar routing should carry on the conversion of the tail.
-*/
-template <endianness big_endian>
-std::tuple<const char16_t *, char32_t *, bool>
-convert_utf16_to_utf32(const char16_t *buf, size_t len,
-                       char32_t *utf32_output) {
-  const char16_t *end = buf + len;
-  const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00);
-  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
-  const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00);
-  __mmask32 carry{0};
-  const __m512i byteflip = _mm512_setr_epi64(
-      0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
-      0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
-      0x0607040502030001, 0x0e0f0c0d0a0b0809);
-  while (std::distance(buf, end) >= 32) {
-    // Always safe because buf + 32 <= end so that end - buf >= 32 bytes:
-    __m512i in = _mm512_loadu_si512((__m512i *)buf);
-    if (big_endian) {
-      in = _mm512_shuffle_epi8(in, byteflip);
+  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
+    if (simdutf_likely(is_ascii(input))) {
+      this->error |= this->prev_incomplete;
+    } else {
+      // you might think that a for-loop would work, but under Visual Studio, it
+      // is not good enough.
+      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+      }
+      this->prev_incomplete =
+          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
+      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
     }
+  }
 
-    // H - bitmask for high surrogates
-    const __mmask32 H =
-        _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800);
-    // H - bitmask for low surrogates
-    const __mmask32 L =
-        _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00);
-
-    if ((H | L)) {
-      // surrogate pair(s) in a register
-      const __mmask32 V =
-          (L ^
-           (carry | (H << 1))); // A high surrogate must be followed by low one
-                                // and a low one must be preceded by a high one.
-                                // If valid, V should be equal to 0
-
-      if (V == 0) {
-        // valid case
-        /*
-            Input surrogate pair:
-            |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb|
-                low surrogate      high surrogate
-        */
-        /*  1. Expand all code units to 32-bit code units
-            in
-           |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
-        */
-        const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in));
-        const __m512i second =
-            _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1));
-
-        /*  2. Shift by one 16-bit word to align low surrogates with high
-           surrogates in
-           |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb|
-            shifted
-           |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
-        */
-        const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1);
-        const __m512i shifted_second =
-            _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1);
+  // do not forget to call check_eof!
+  simdutf_really_inline bool errors() const {
+    return this->error.any_bits_set_anywhere();
+  }
 
-        /*  3. Align all high surrogates in first and second by shifting to the
-           left by 10 bits
-            |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
-        */
-        const __m512i aligned_first =
-            _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10);
-        const __m512i aligned_second =
-            _mm512_mask_slli_epi32(second, (__mmask16)(H >> 16), second, 10);
+}; // struct utf8_checker
+} // namespace utf8_validation
 
-        /*  4. Remove surrogate prefixes and add offset 0x10000 by adding in,
-           shifted and constant in
-           |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000|
-            shifted
-           |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa|
-            constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000|
-        */
-        const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400);
-        const __m512i added_first = _mm512_mask_add_epi32(
-            aligned_first, (__mmask16)H, aligned_first, shifted_first);
-        const __m512i utf32_first = _mm512_mask_add_epi32(
-            added_first, (__mmask16)H, added_first, constant);
+using utf8_validation::utf8_checker;
 
-        const __m512i added_second =
-            _mm512_mask_add_epi32(aligned_second, (__mmask16)(H >> 16),
-                                  aligned_second, shifted_second);
-        const __m512i utf32_second = _mm512_mask_add_epi32(
-            added_second, (__mmask16)(H >> 16), added_second, constant);
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
+/* begin file src/generic/utf8_validation/utf8_validator.h */
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace utf8_validation {
 
-        //  5. Store all valid UTF-32 code units (low surrogate positions and
-        //  32nd word are invalid)
-        const __mmask32 valid = ~L & 0x7fffffff;
-        // We deliberately do a _mm512_maskz_compress_epi32 followed by
-        // storeu_epi32 to ease performance portability to Zen 4.
-        const __m512i compressed_first =
-            _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first);
-        const size_t howmany1 = count_ones((uint16_t)(valid));
-        _mm512_storeu_si512((__m512i *)utf32_output, compressed_first);
-        utf32_output += howmany1;
-        const __m512i compressed_second =
-            _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second);
-        const size_t howmany2 = count_ones((uint16_t)(valid >> 16));
-        // The following could be unsafe in some cases?
-        //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second);
-        _mm512_mask_storeu_epi32((__m512i *)utf32_output,
-                                 __mmask16((1 << howmany2) - 1),
-                                 compressed_second);
-        utf32_output += howmany2;
-        // Only process 31 code units, but keep track if the 31st word is a high
-        // surrogate as a carry
-        buf += 31;
-        carry = (H >> 30) & 0x1;
-      } else {
-        // invalid case
-        return std::make_tuple(buf + carry, utf32_output, false);
-      }
-    } else {
-      // no surrogates
-      // extend all thirty-two 16-bit code units to thirty-two 32-bit code units
-      _mm512_storeu_si512((__m512i *)(utf32_output),
-                          _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)));
-      _mm512_storeu_si512(
-          (__m512i *)(utf32_output) + 1,
-          _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)));
-      utf32_output += 32;
-      buf += 32;
-      carry = 0;
-    }
-  } // while
-  return std::make_tuple(buf + carry, utf32_output, true);
-}
-/* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */
-/* begin file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */
-// file included directly
-size_t icelake_convert_utf32_to_latin1(const char32_t *buf, size_t len,
-                                       char *latin1_output) {
-  const char32_t *end = buf + len;
-  __m512i v_0xFF = _mm512_set1_epi32(0xff);
-  __m512i shufmask = _mm512_set_epi8(
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
-      56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
-  while (end - buf >= 16) {
-    __m512i in = _mm512_loadu_si512((__m512i *)buf);
-    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
-      return 0;
-    }
-    _mm_storeu_si128(
-        (__m128i *)latin1_output,
-        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
-    latin1_output += 16;
-    buf += 16;
-  }
-  if (buf < end) {
-    uint16_t mask = uint16_t((1 << (end - buf)) - 1);
-    __m512i in = _mm512_maskz_loadu_epi32(mask, buf);
-    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
-      return 0;
-    }
-    _mm_mask_storeu_epi8(
-        latin1_output, mask,
-        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
+/**
+ * Validates that the string is actual UTF-8.
+ */
+template <class checker>
+bool generic_validate_utf8(const uint8_t *input, size_t length) {
+  checker c{};
+  buf_block_reader<64> reader(input, length);
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
+    c.check_next_input(in);
+    reader.advance();
   }
-  return len;
+  uint8_t block[64]{};
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  c.check_next_input(in);
+  reader.advance();
+  c.check_eof();
+  return !c.errors();
 }
 
-std::pair<result, char *>
-icelake_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
-                                            char *latin1_output) {
-  const char32_t *end = buf + len;
-  const char32_t *start = buf;
-  __m512i v_0xFF = _mm512_set1_epi32(0xff);
-  __m512i shufmask = _mm512_set_epi8(
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
-      56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
-  while (end - buf >= 16) {
-    __m512i in = _mm512_loadu_si512((__m512i *)buf);
-    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
-      while (uint32_t(*buf) <= 0xff) {
-        *latin1_output++ = uint8_t(*buf++);
-      }
-      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
-                            latin1_output);
+bool generic_validate_utf8(const char *input, size_t length) {
+  return generic_validate_utf8<utf8_checker>(
+      reinterpret_cast<const uint8_t *>(input), length);
+}
+
+/**
+ * Validates that the string is actual UTF-8 and stops on errors.
+ */
+template <class checker>
+result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
+  checker c{};
+  buf_block_reader<64> reader(input, length);
+  size_t count{0};
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
+    c.check_next_input(in);
+    if (c.errors()) {
+      if (count != 0) {
+        count--;
+      } // Sometimes the error is only detected in the next chunk
+      result res = scalar::utf8::rewind_and_validate_with_errors(
+          reinterpret_cast<const char *>(input),
+          reinterpret_cast<const char *>(input + count), length - count);
+      res.count += count;
+      return res;
     }
-    _mm_storeu_si128(
-        (__m128i *)latin1_output,
-        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
-    latin1_output += 16;
-    buf += 16;
+    reader.advance();
+    count += 64;
   }
-  if (buf < end) {
-    uint16_t mask = uint16_t((1 << (end - buf)) - 1);
-    __m512i in = _mm512_maskz_loadu_epi32(mask, buf);
-    if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
-      while (uint32_t(*buf) <= 0xff) {
-        *latin1_output++ = uint8_t(*buf++);
-      }
-      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
-                            latin1_output);
-    }
-    _mm_mask_storeu_epi8(
-        latin1_output, mask,
-        _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in)));
+  uint8_t block[64]{};
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  c.check_next_input(in);
+  reader.advance();
+  c.check_eof();
+  if (c.errors()) {
+    if (count != 0) {
+      count--;
+    } // Sometimes the error is only detected in the next chunk
+    result res = scalar::utf8::rewind_and_validate_with_errors(
+        reinterpret_cast<const char *>(input),
+        reinterpret_cast<const char *>(input) + count, length - count);
+    res.count += count;
+    return res;
+  } else {
+    return result(error_code::SUCCESS, length);
   }
-  return std::make_pair(result(error_code::SUCCESS, len), latin1_output);
 }
-/* end file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */
-/* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
-// file included directly
-
-// Todo: currently, this is just the haswell code, optimize for icelake kernel.
-std::pair<const char32_t *, char *>
-avx512_convert_utf32_to_utf8(const char32_t *buf, size_t len,
-                             char *utf8_output) {
-  const char32_t *end = buf + len;
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-  __m256i running_max = _mm256_setzero_si256();
-  __m256i forbidden_bytemask = _mm256_setzero_si256();
-
-  const size_t safety_margin =
-      12; // to avoid overruns, see issue
-          // https://github.com/simdutf/simdutf/issues/92
 
-  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
-    __m256i in = _mm256_loadu_si256((__m256i *)buf);
-    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
-    running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
+result generic_validate_utf8_with_errors(const char *input, size_t length) {
+  return generic_validate_utf8_with_errors<utf8_checker>(
+      reinterpret_cast<const uint8_t *>(input), length);
+}
 
-    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
-    // saturation
-    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
-                                        _mm256_and_si256(nextin, v_7fffffff));
-    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+template <class checker>
+bool generic_validate_ascii(const uint8_t *input, size_t length) {
+  buf_block_reader<64> reader(input, length);
+  uint8_t blocks[64]{};
+  simd::simd8x64<uint8_t> running_or(blocks);
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
+    running_or |= in;
+    reader.advance();
+  }
+  uint8_t block[64]{};
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  running_or |= in;
+  return running_or.is_ascii();
+}
 
-    // Try to apply UTF-16 => UTF-8 routine on 256 bits
-    // (haswell/avx2_convert_utf16_to_utf8.cpp)
+bool generic_validate_ascii(const char *input, size_t length) {
+  return generic_validate_ascii<utf8_checker>(
+      reinterpret_cast<const uint8_t *>(input), length);
+}
 
-    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(
-          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
+template <class checker>
+result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) {
+  buf_block_reader<64> reader(input, length);
+  size_t count{0};
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
+    if (!in.is_ascii()) {
+      result res = scalar::ascii::validate_with_errors(
+          reinterpret_cast<const char *>(input + count), length - count);
+      return result(res.error, count + res.count);
     }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
+    reader.advance();
 
-      // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked =
-          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+    count += 64;
+  }
+  uint8_t block[64]{};
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  if (!in.is_ascii()) {
+    result res = scalar::ascii::validate_with_errors(
+        reinterpret_cast<const char *>(input + count), length - count);
+    return result(res.error, count + res.count);
+  } else {
+    return result(error_code::SUCCESS, length);
+  }
+}
 
-      // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
-      // 4. pack the bytes
+result generic_validate_ascii_with_errors(const char *input, size_t length) {
+  return generic_validate_ascii_with_errors<utf8_checker>(
+      reinterpret_cast<const uint8_t *>(input), length);
+}
 
-      const uint8_t *row =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t *row_2 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
-                                                                       16)][0];
+} // namespace utf8_validation
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/utf8_validation/utf8_validator.h */
+// transcoding from UTF-8 to UTF-16
+/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
-      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace utf8_to_utf16 {
+using namespace simd;
 
-      const __m256i utf8_packed = _mm256_shuffle_epi8(
-          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
-      // 5. store bytes
-      _mm_storeu_si128((__m128i *)utf8_output,
-                       _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i *)utf8_output,
-                       _mm256_extractf128_si256(utf8_packed, 1));
-      utf8_output += row_2[0];
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation)
+  // Bit 2 = Overlong 3-byte
+  // Bit 4 = Surrogate
+  // Bit 5 = Overlong 2-byte
+  // Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
-      // 6. adjust pointers
-      buf += 16;
-      continue;
-    }
-    // Must check for overflow in packing
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
-        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-    if (saturation_bitmask == 0xffffffff) {
-      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
-      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-      forbidden_bytemask = _mm256_or_si256(
-          forbidden_bytemask,
-          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
 
-      const __m256i dup_even = _mm256_setr_epi16(
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+              // ____0100 ________
+              CARRY | TOO_LARGE,
+              // ____0101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____011_ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
 
-      /* In this branch we handle three cases:
-        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
-        single UFT-8 byte
-        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
-        UTF-8 bytes
-        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
-        three UTF-8 bytes
+              // ____1___ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____1101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
 
-        We expand the input word (16-bit) into two code units (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
 
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high);
+}
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) {
+  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
+  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
+  simd8<uint8_t> must23 =
+      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
+  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
+  return must23_80 ^ sc;
+}
 
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
+struct validating_transcoder {
+  // If this is nonzero, there has been a UTF-8 error.
+  simd8<uint8_t> error;
 
-        Finally from these two code units we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+  validating_transcoder() : error(uint8_t(0)) {}
+  //
+  // Check whether the current bytes are valid UTF-8.
+  //
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // Look up each (prev1, input) byte pair in the special-case tables, then
+    // let check_multibyte_lengths cross-check that result against the bytes
+    // two and three positions back (prev2/prev3, derived using prev_input).
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    simd8<uint8_t> sc = check_special_cases(input, prev1);
+    this->error |= check_multibyte_lengths(input, prev_input, sc);
+  }
 
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
-                                             simdutf_vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef simdutf_vec
+  template <endianness endian>
+  simdutf_really_inline size_t convert(const char *in, size_t size,
+                                       char16_t *utf16_output) {
+    size_t pos = 0;
+    char16_t *start{utf16_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, then in[margin] is the eighth-to-last
+    // leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf16<endian>(utf16_output);
+        utf16_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (utf8_continuation_mask & 1) {
+          return 0; // error
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 and 12 input bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf16<endian>(
+              in + pos, utf8_end_of_code_point_mask, utf16_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      return 0;
+    }
+    if (pos < size) {
+      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
+          in + pos, size - pos, utf16_output);
+      if (howmany == 0) {
+        return 0;
+      }
+      utf16_output += howmany;
+    }
+    return utf16_output - start;
+  }
 
-      // 4. expand code units 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+  template <endianness endian>
+  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+                                                   char16_t *utf16_output) {
+    size_t pos = 0;
+    char16_t *start{utf16_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, then in[margin] is the eighth-to-last
+    // leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf16<endian>(utf16_output);
+        utf16_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (errors() || (utf8_continuation_mask & 1)) {
+          // rewind_and_convert_with_errors will seek a potential error from
+          // in+pos onward, with the ability to go back up to pos bytes, and
+          // read size-pos bytes forward.
+          result res =
+              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+                  pos, in + pos, size - pos, utf16_output);
+          res.count += pos;
+          return res;
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 and 12 input bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf16<endian>(
+              in + pos, utf8_end_of_code_point_mask, utf16_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res =
+          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+              pos, in + pos, size - pos, utf16_output);
+      res.count += pos;
+      return res;
+    }
+    if (pos < size) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res =
+          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+              pos, in + pos, size - pos, utf16_output);
+      if (res.error) { // In case of error, we want the error position
+        res.count += pos;
+        return res;
+      } else { // In case of success, we want the number of words written
+        utf16_output += res.count;
+      }
+    }
+    return result(error_code::SUCCESS, utf16_output - start);
+  }
 
-      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be
-      // useful.
-      /*if(mask == 0) {
-        // We only have three-byte code units. Use fast path.
-        const __m256i shuffle =
-      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
-      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
-      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
-      _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output,
-      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output,
-      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
-        continue;
-      }*/
-      const uint8_t mask0 = uint8_t(mask);
-      const uint8_t *row0 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
-      const __m128i utf8_0 =
-          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+  simdutf_really_inline bool errors() const {
+    return this->error.any_bits_set_anywhere();
+  }
 
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-      const uint8_t *row1 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
-      const __m128i utf8_1 =
-          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+}; // struct utf8_checker
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
+/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t *row2 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
-      const __m128i utf8_2 =
-          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace utf8_to_utf16 {
 
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t *row3 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
-      const __m128i utf8_3 =
-          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+using namespace simd;
 
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
-      utf8_output += row1[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
-    } else {
-      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
-      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
-      // wasteful to use scalar code, but being efficient with SIMD may require
-      // large, non-trivial tables?
-      size_t forward = 15;
-      size_t k = 0;
-      if (size_t(end - buf) < forward + 1) {
-        forward = size_t(end - buf - 1);
-      }
-      for (; k < forward; k++) {
-        uint32_t word = buf[k];
-        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
-          *utf8_output++ = char(word);
-        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
-          *utf8_output++ = char((word >> 6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
-          if (word >= 0xD800 && word <= 0xDFFF) {
-            return std::make_pair(nullptr, utf8_output);
-          }
-          *utf8_output++ = char((word >> 12) | 0b11100000);
-          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else { // 4-byte
-          if (word > 0x10FFFF) {
-            return std::make_pair(nullptr, utf8_output);
-          }
-          *utf8_output++ = char((word >> 18) | 0b11110000);
-          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
+template <endianness endian>
+simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
+                                         char16_t *utf16_output) noexcept {
+  // This implementation is not specific to haswell; it already lives in the
+  // generic directory (src/generic/utf8_to_utf16/valid_utf8_to_utf16.h).
+  size_t pos = 0;
+  char16_t *start{utf16_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while (pos + 64 + safety_margin <= size) {
+    // this loop could be unrolled further. For example, we could process the
+    // mask far more than 64 bytes.
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if (in.is_ascii()) {
+      in.store_ascii_as_utf16<endian>(utf16_output);
+      utf16_output += 64;
+      pos += 64;
+    } else {
+      // Slow path. We hope that the compiler will recognize that this is a slow
+      // path. Anything that is not a continuation mask is a 'leading byte',
+      // that is, the start of a new code point.
+      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+      // -65 is 0b10111111 in two's complement, so it is the largest possible
+      // continuation byte
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      // The *start* of code points is not so useful, rather, we want the *end*
+      // of code points.
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+      // We process in blocks of up to 12 bytes except possibly
+      // for fast paths which may process up to 16 bytes. For the
+      // slow path to work, we should have at least 12 input bytes left.
+      size_t max_starting_point = (pos + 64) - 12;
+      // Next loop is going to run at least five times when using solely
+      // the slow/regular path, and at least four times if there are fast paths.
+      while (pos < max_starting_point) {
+        // Performance note: our ability to compute 'consumed' and
+        // then shift and recompute is critical. If there is a
+        // latency of, say, 4 cycles on getting 'consumed', then
+        // the inner loop might have a total latency of about 6 cycles.
+        // Yet we process between 6 and 12 input bytes, thus we get
+        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+        // for this section of the code. Hence, there is a limit
+        // to how much we can further increase this latency before
+        // it seriously harms performance.
+        //
+        // Thus we may allow convert_masked_utf8_to_utf16 to process
+        // more bytes at a time under a fast-path mode where 16 bytes
+        // are consumed at once (e.g., when encountering ASCII).
+        size_t consumed = convert_masked_utf8_to_utf16<endian>(
+            input + pos, utf8_end_of_code_point_mask, utf16_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
       }
-      buf += k;
+      // At this point there may remain between 0 and 12 bytes in the
+      // 64-byte block. These bytes will be processed again. So we have an
+      // 80% efficiency (in the worst case). In practice we expect an
+      // 85% to 90% efficiency.
     }
-  } // while
-
-  // check for invalid input
-  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(
-          _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
-    return std::make_pair(nullptr, utf8_output);
   }
-
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
-    return std::make_pair(nullptr, utf8_output);
-  }
-
-  return std::make_pair(buf, utf8_output);
+  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
+      input + pos, size - pos, utf16_output);
+  return utf16_output - start;
 }
 
-// Todo: currently, this is just the haswell code, optimize for icelake kernel.
-std::pair<result, char *>
-avx512_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
-                                         char *utf8_output) {
-  const char32_t *end = buf + len;
-  const char32_t *start = buf;
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
+// transcoding from UTF-8 to UTF-32
+/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace utf8_to_utf32 {
+using namespace simd;
 
-  const size_t safety_margin =
-      12; // to avoid overruns, see issue
-          // https://github.com/simdutf/simdutf/issues/92
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation)
+  // Bit 2 = Overlong 3-byte; Bit 3 = Too Large
+  // Bit 4 = Surrogate
+  // Bit 5 = Overlong 2-byte; Bit 6 = Too Large (1000____) / Overlong 4-byte
+  // Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
-  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
-    __m256i in = _mm256_loadu_si256((__m256i *)buf);
-    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
-    // Check for too large input
-    const __m256i max_input =
-        _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
-    if (static_cast<uint32_t>(_mm256_movemask_epi8(
-            _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
-      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
-                            utf8_output);
-    }
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
 
-    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
-    // saturation
-    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
-                                        _mm256_and_si256(nextin, v_7fffffff));
-    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+              // ____0100 ________
+              CARRY | TOO_LARGE,
+              // ____0101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____011_ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
 
-    // Try to apply UTF-16 => UTF-8 routine on 256 bits
-    // (haswell/avx2_convert_utf16_to_utf8.cpp)
+              // ____1___ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____1101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
 
-    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(
-          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
-    }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
 
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high);
+}
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) {
+  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
+  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
+  simd8<uint8_t> must23 =
+      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
+  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
+  return must23_80 ^ sc;
+}
 
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
+struct validating_transcoder {
+  // If this is nonzero, there has been a UTF-8 error.
+  simd8<uint8_t> error;
 
-      // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked =
-          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+  validating_transcoder() : error(uint8_t(0)) {}
+  //
+  // Check whether the current bytes are valid UTF-8.
+  //
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
+    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
+    // small negative numbers)
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    simd8<uint8_t> sc = check_special_cases(input, prev1);
+    this->error |= check_multibyte_lengths(input, prev_input, sc);
+  }
 
-      // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
-      // 4. pack the bytes
+  simdutf_really_inline size_t convert(const char *in, size_t size,
+                                       char32_t *utf32_output) {
+    size_t pos = 0;
+    char32_t *start{utf32_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, then in[margin] is the eighth-last leading
+    // byte (the loop above stops after counting 8 leading bytes).
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf32(utf32_output);
+        utf32_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (utf8_continuation_mask & 1) {
+          return 0; // we have an error
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 and 12 input bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf32(
+              in + pos, utf8_end_of_code_point_mask, utf32_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      return 0;
+    }
+    if (pos < size) {
+      size_t howmany =
+          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+      if (howmany == 0) {
+        return 0;
+      }
+      utf32_output += howmany;
+    }
+    return utf32_output - start;
+  }
 
-      const uint8_t *row =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t *row_2 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
-                                                                       16)][0];
+  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+                                                   char32_t *utf32_output) {
+    size_t pos = 0;
+    char32_t *start{utf32_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, then in[margin] is the eighth-last leading
+    // byte (the loop above stops after counting 8 leading bytes).
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf32(utf32_output);
+        utf32_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (errors() || (utf8_continuation_mask & 1)) {
+          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+              pos, in + pos, size - pos, utf32_output);
+          res.count += pos;
+          return res;
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 and 12 input bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf32(
+              in + pos, utf8_end_of_code_point_mask, utf32_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, utf32_output);
+      res.count += pos;
+      return res;
+    }
+    if (pos < size) {
+      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, utf32_output);
+      if (res.error) { // In case of error, we want the error position
+        res.count += pos;
+        return res;
+      } else { // In case of success, we want the number of words written
+        utf32_output += res.count;
+      }
+    }
+    return result(error_code::SUCCESS, utf32_output - start);
+  }
 
-      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+  simdutf_really_inline bool errors() const {
+    return this->error.any_bits_set_anywhere();
+  }
 
-      const __m256i utf8_packed = _mm256_shuffle_epi8(
-          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
-      // 5. store bytes
-      _mm_storeu_si128((__m128i *)utf8_output,
-                       _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i *)utf8_output,
-                       _mm256_extractf128_si256(utf8_packed, 1));
-      utf8_output += row_2[0];
+}; // struct validating_transcoder
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
+/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 
-      // 6. adjust pointers
-      buf += 16;
-      continue;
-    }
-    // Must check for overflow in packing
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
-        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-    if (saturation_bitmask == 0xffffffff) {
-      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace utf8_to_utf32 {
 
-      // Check for illegal surrogate code units
-      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-      const __m256i forbidden_bytemask =
-          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
-          0x0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start),
-                              utf8_output);
+using namespace simd;
+
+simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
+                                         char32_t *utf32_output) noexcept {
+  size_t pos = 0;
+  char32_t *start{utf32_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while (pos + 64 + safety_margin <= size) {
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if (in.is_ascii()) {
+      in.store_ascii_as_utf32(utf32_output);
+      utf32_output += 64;
+      pos += 64;
+    } else {
+      // -65 is 0b10111111 in two's complement, so it is the largest possible
+      // continuation byte
+      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+      size_t max_starting_point = (pos + 64) - 12;
+      while (pos < max_starting_point) {
+        size_t consumed = convert_masked_utf8_to_utf32(
+            input + pos, utf8_end_of_code_point_mask, utf32_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
       }
+    }
+  }
+  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
+                                                       utf32_output);
+  return utf32_output - start;
+}
 
-      const __m256i dup_even = _mm256_setr_epi16(
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
-        single UFT-8 byte
-        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
-        UTF-8 bytes
-        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
-        three UTF-8 bytes
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
+// other functions
+/* begin file src/generic/utf16.h */
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace utf16 {
 
-        We expand the input word (16-bit) into two code units (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
+template <endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t *in,
+                                               size_t size) {
+  const size_t simd_end = size / 32 * 32; // whole 32-unit blocks only
+  size_t total = 0;
+  size_t idx = 0;
+  for (; idx < simd_end; idx += 32) {
+    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + idx));
+    if (!match_system(big_endian)) {
+      block.swap_bytes();
+    }
+    // Tally the units that fall outside 0xDC00..0xDFFF.
+    total += count_ones(block.not_in_range(0xDC00, 0xDFFF)) / 2;
+  }
+  return total +
+         scalar::utf16::count_code_points<big_endian>(in + idx, size - idx);
+}
 
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
+template <endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
+                                                    size_t size) {
+  size_t pos = 0;
+  size_t count = 0;
+  // This algorithm could no doubt be improved!
+  for (; pos < size / 32 * 32; pos += 32) {
+    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+    if (!match_system(big_endian)) {
+      input.swap_bytes();
+    }
+    uint64_t ascii_mask = input.lteq(0x7F); // units needing one byte
+    uint64_t twobyte_mask = input.lteq(0x7FF); // units needing at most two
+    uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); // non-surrogates
 
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
+    size_t ascii_count = count_ones(ascii_mask) / 2;
+    size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
+    size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
+    size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; // surrogates
+    count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count +
+             ascii_count; // each surrogate unit is charged two bytes
+  }
+  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
+                                                                   size - pos);
+}
 
-        Finally from these two code units we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+template <endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
+                                                     size_t size) {
+  return count_code_points<big_endian>(in, size); // same as the code point count
+}
 
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
-                                             simdutf_vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
-#undef simdutf_vec
+simdutf_really_inline void
+change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
+  size_t pos = 0; // index of the next unit to byte-swap
 
-      // 4. expand code units 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+  while (pos < size / 32 * 32) { // whole 32-unit blocks only
+    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+    input.swap_bytes();
+    input.store(reinterpret_cast<uint16_t *>(output));
+    pos += 32;
+    output += 32;
+  }
 
-      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be
-      // useful.
-      /*if(mask == 0) {
-        // We only have three-byte code units. Use fast path.
-        const __m256i shuffle =
-      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
-      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
-      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
-      _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output,
-      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output,
-      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
-        continue;
-      }*/
-      const uint8_t mask0 = uint8_t(mask);
-      const uint8_t *row0 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
-      const __m128i utf8_0 =
-          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); // tail
+}
 
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
-      const uint8_t *row1 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
-      const __m128i utf8_1 =
-          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
+} // namespace utf16
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/utf16.h */
+/* begin file src/generic/utf8.h */
 
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t *row2 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
-      const __m128i utf8_2 =
-          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
+namespace simdutf {
+namespace ppc64 {
+namespace {
+namespace utf8 {
 
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t *row3 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
-      const __m128i utf8_3 =
-          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+using namespace simd;
 
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
-      utf8_output += row1[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
-    } else {
-      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
-      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
-      // wasteful to use scalar code, but being efficient with SIMD may require
-      // large, non-trivial tables?
-      size_t forward = 15;
-      size_t k = 0;
-      if (size_t(end - buf) < forward + 1) {
-        forward = size_t(end - buf - 1);
-      }
-      for (; k < forward; k++) {
-        uint32_t word = buf[k];
-        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
-          *utf8_output++ = char(word);
-        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
-          *utf8_output++ = char((word >> 6) | 0b11000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
-          if (word >= 0xD800 && word <= 0xDFFF) {
-            return std::make_pair(
-                result(error_code::SURROGATE, buf - start + k), utf8_output);
-          }
-          *utf8_output++ = char((word >> 12) | 0b11100000);
-          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else { // 4-byte
-          if (word > 0x10FFFF) {
-            return std::make_pair(
-                result(error_code::TOO_LARGE, buf - start + k), utf8_output);
-          }
-          *utf8_output++ = char((word >> 18) | 0b11110000);
-          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
-          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
-          *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        }
-      }
-      buf += k;
-    }
-  } // while
+simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
+  size_t total = 0; // tally gathered from the 64-byte blocks
+  size_t offset = 0;
+  for (; offset + 64 <= size; offset += 64) {
+    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(in + offset));
+    // Bytes above -65 (signed) are not continuation bytes: tally those.
+    total += count_ones(block.gt(-65));
+  }
+  return total + scalar::utf8::count_code_points(in + offset, size - offset);
+}
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
+                                                    size_t size) {
+  size_t pos = 0;
+  size_t count = 0;
+  // This algorithm could no doubt be improved!
+  for (; pos + 64 <= size; pos += 64) {
+    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+    // We count one word for anything that is not a continuation (so
+    // leading bytes).
+    count += 64 - count_ones(utf8_continuation_mask);
+    uint64_t utf8_4byte = input.gteq_unsigned(240); // bytes >= 0xF0
+    count += count_ones(utf8_4byte); // one extra unit for each such byte
+  }
+  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
-/* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */
-/* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
-// file included directly
+} // namespace utf8
+} // unnamed namespace
+} // namespace ppc64
+} // namespace simdutf
+/* end file src/generic/utf8.h */
 
-// Todo: currently, this is just the haswell code, optimize for icelake kernel.
-template <endianness big_endian>
-std::pair<const char32_t *, char16_t *>
-avx512_convert_utf32_to_utf16(const char32_t *buf, size_t len,
-                              char16_t *utf16_output) {
-  const char32_t *end = buf + len;
+//
+// Implementation-specific overrides
+//
+namespace simdutf {
+namespace ppc64 {
 
-  const size_t safety_margin =
-      12; // to avoid overruns, see issue
-          // https://github.com/simdutf/simdutf/issues/92
-  __m256i forbidden_bytemask = _mm256_setzero_si256();
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+                                 size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  if (bom_encoding != encoding_type::unspecified) {
+    return bom_encoding;
+  }
+  // todo: reimplement as a one-pass algorithm.
+  int out = 0; // OR of every encoding_type flag the input validates as
+  if (validate_utf8(input, length)) {
+    out |= encoding_type::UTF8;
+  }
+  if ((length % 2) == 0) { // only an even byte count is tried as UTF-16
+    if (validate_utf16(reinterpret_cast<const char16_t *>(input), length / 2)) {
+      out |= encoding_type::UTF16_LE;
+    }
+  }
+  if ((length % 4) == 0) { // only a multiple of four bytes is tried as UTF-32
+    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+      out |= encoding_type::UTF32_LE;
+    }
+  }
 
-  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
-    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+  return out;
+}
 
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  return ppc64::utf8_validation::generic_validate_utf8(buf, len); // generic code
+}
 
-    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-    const __m256i saturation_bytemask =
-        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const uint32_t saturation_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+    const char *buf, size_t len) const noexcept { // delegates to generic code
+  return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+}
 
-    if (saturation_bitmask == 0xffffffff) {
-      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-      forbidden_bytemask = _mm256_or_si256(
-          forbidden_bytemask,
-          _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  return ppc64::utf8_validation::generic_validate_ascii(buf, len); // generic code
+}
 
-      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
-                                              _mm256_extractf128_si256(in, 1));
-      if (big_endian) {
-        const __m128i swap =
-            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
-      _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if (size_t(end - buf) < forward + 1) {
-        forward = size_t(end - buf - 1);
-      }
-      for (; k < forward; k++) {
-        uint32_t word = buf[k];
-        if ((word & 0xFFFF0000) == 0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) {
-            return std::make_pair(nullptr, utf16_output);
-          }
-          *utf16_output++ =
-              big_endian
-                  ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
-                  : char16_t(word);
-        } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) {
-            return std::make_pair(nullptr, utf16_output);
-          }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate =
-                uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate =
-                uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
-        }
-      }
-      buf += k;
-    }
-  }
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+    const char *buf, size_t len) const noexcept { // delegates to generic code
+  return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf, len);
+}
 
-  // check for invalid input
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
-    return std::make_pair(nullptr, utf16_output);
-  }
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *buf,
+                                 size_t len) const noexcept {
+  return scalar::utf16::validate<endianness::LITTLE>(buf, len); // scalar code
+}
 
-  return std::make_pair(buf, utf16_output);
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *buf,
+                                 size_t len) const noexcept {
+  return scalar::utf16::validate<endianness::BIG>(buf, len); // scalar code
 }
 
-// Todo: currently, this is just the haswell code, optimize for icelake kernel.
-template <endianness big_endian>
-std::pair<result, char16_t *>
-avx512_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
-                                          char16_t *utf16_output) {
-  const char32_t *start = buf;
-  const char32_t *end = buf + len;
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+    const char16_t *buf, size_t len) const noexcept { // delegates to scalar code
+  return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
+}
 
-  const size_t safety_margin =
-      12; // to avoid overruns, see issue
-          // https://github.com/simdutf/simdutf/issues/92
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+    const char16_t *buf, size_t len) const noexcept { // delegates to scalar code
+  return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
+}
 
-  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
-    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+    const char32_t *buf, size_t len) const noexcept {
+  return scalar::utf32::validate_with_errors(buf, len); // scalar code
+}
 
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  return scalar::utf32::validate(buf, len); // scalar code
+}
 
-    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-    const __m256i saturation_bytemask =
-        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const uint32_t saturation_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+    const char * /*buf*/, size_t /*len*/,
+    char16_t * /*utf16_output*/) const noexcept {
+  return 0; // stub: ignores every argument
+}
 
-    if (saturation_bitmask == 0xffffffff) {
-      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-      const __m256i forbidden_bytemask =
-          _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
-          0x0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start),
-                              utf16_output);
-      }
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+    const char * /*buf*/, size_t /*len*/,
+    char16_t * /*utf16_output*/) const noexcept {
+  return 0; // stub: ignores every argument
+}
 
-      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
-                                              _mm256_extractf128_si256(in, 1));
-      if (big_endian) {
-        const __m128i swap =
-            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
-      }
-      _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
-    } else {
-      size_t forward = 7;
-      size_t k = 0;
-      if (size_t(end - buf) < forward + 1) {
-        forward = size_t(end - buf - 1);
-      }
-      for (; k < forward; k++) {
-        uint32_t word = buf[k];
-        if ((word & 0xFFFF0000) == 0) {
-          // will not generate a surrogate pair
-          if (word >= 0xD800 && word <= 0xDFFF) {
-            return std::make_pair(
-                result(error_code::SURROGATE, buf - start + k), utf16_output);
-          }
-          *utf16_output++ =
-              big_endian
-                  ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
-                  : char16_t(word);
-        } else {
-          // will generate a surrogate pair
-          if (word > 0x10FFFF) {
-            return std::make_pair(
-                result(error_code::TOO_LARGE, buf - start + k), utf16_output);
-          }
-          word -= 0x10000;
-          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
-          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
-            high_surrogate =
-                uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate =
-                uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
-          }
-          *utf16_output++ = char16_t(high_surrogate);
-          *utf16_output++ = char16_t(low_surrogate);
-        }
-      }
-      buf += k;
-    }
-  }
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+    const char * /*buf*/, size_t /*len*/,
+    char16_t * /*utf16_output*/) const noexcept {
+  return result(error_code::OTHER, 0); // stub: always error_code::OTHER
+}
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+    const char * /*buf*/, size_t /*len*/,
+    char16_t * /*utf16_output*/) const noexcept {
+  return result(error_code::OTHER, 0); // stub: always error_code::OTHER
 }
-/* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */
-/* begin file src/icelake/icelake_ascii_validation.inl.cpp */
-// file included directly
 
-bool validate_ascii(const char *buf, size_t len) {
-  const char *end = buf + len;
-  const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
-  __m512i running_or = _mm512_setzero_si512();
-  for (; end - buf >= 64; buf += 64) {
-    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)buf);
-    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii,
-                                           0xf8); // running_or | (utf8 & ascii)
-  }
-  if (buf < end) {
-    const __m512i utf8 = _mm512_maskz_loadu_epi8(
-        (uint64_t(1) << (end - buf)) - 1, (const __m512i *)buf);
-    running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii,
-                                           0xf8); // running_or | (utf8 & ascii)
-  }
-  return (_mm512_test_epi8_mask(running_or, running_or) == 0);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+    const char * /*buf*/, size_t /*len*/,
+    char16_t * /*utf16_output*/) const noexcept {
+  return 0; // stub: ignores every argument
 }
-/* end file src/icelake/icelake_ascii_validation.inl.cpp */
-/* begin file src/icelake/icelake_utf32_validation.inl.cpp */
-// file included directly
 
-const char32_t *validate_utf32(const char32_t *buf, size_t len) {
-  if (len < 16) {
-    return buf;
-  }
-  const char32_t *end = buf + len - 16;
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+    const char * /*buf*/, size_t /*len*/,
+    char16_t * /*utf16_output*/) const noexcept {
+  return 0; // stub: ignores every argument
+}
 
-  const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000);
-  __m512i currentmax = _mm512_setzero_si512();
-  __m512i currentoffsetmax = _mm512_setzero_si512();
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+    const char * /*buf*/, size_t /*len*/,
+    char32_t * /*utf32_output*/) const noexcept {
+  return 0; // stub: ignores every argument
+}
 
-  while (buf <= end) {
-    __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf);
-    buf += 16;
-    currentoffsetmax =
-        _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax);
-    currentmax = _mm512_max_epu32(utf32, currentmax);
-  }
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+    const char * /*buf*/, size_t /*len*/,
+    char32_t * /*utf32_output*/) const noexcept {
+  return result(error_code::OTHER, 0); // stub: always error_code::OTHER
+}
 
-  const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff);
-  const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff);
-  __m512i is_zero =
-      _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax);
-  if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
-    return nullptr;
-  }
-  is_zero = _mm512_xor_si512(
-      _mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-  if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) {
-    return nullptr;
-  }
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+    const char * /*buf*/, size_t /*len*/,
+    char32_t * /*utf32_output*/) const noexcept {
+  return 0; // stub: ignores every argument
+}
 
-  return buf;
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len,
+                                                            utf8_output); // scalar code
 }
-/* end file src/icelake/icelake_utf32_validation.inl.cpp */
-/* begin file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */
-// file included directly
 
-static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len,
-                                               char *utf8_output,
-                                               int mask_output) {
-  __mmask64 nonascii = _mm512_movepi8_mask(input);
-  size_t output_size = input_len + (size_t)count_ones(nonascii);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept { // scalar code
+  return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
+}
 
-  // Mask to denote whether the byte is a leading byte that is not ascii
-  __mmask64 sixth = _mm512_cmpge_epu8_mask(
-      input, _mm512_set1_epi8(-64)); // binary representation of -64: 1100 0000
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+      buf, len, utf8_output); // scalar code
+}
 
-  const uint64_t alternate_bits = UINT64_C(0x5555555555555555);
-  uint64_t ascii = ~nonascii;
-  // the bits in ascii are inverted and zeros are interspersed in between them
-  uint64_t maskA = ~_pdep_u64(ascii, alternate_bits);
-  uint64_t maskB = ~_pdep_u64(ascii >> 32, alternate_bits);
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+      buf, len, utf8_output); // scalar code
+}
 
-  // interleave bytes from top and bottom halves (abcd...ABCD -> aAbBcCdD)
-  __m512i input_interleaved = _mm512_permutexvar_epi8(
-      _mm512_set_epi32(0x3f1f3e1e, 0x3d1d3c1c, 0x3b1b3a1a, 0x39193818,
-                       0x37173616, 0x35153414, 0x33133212, 0x31113010,
-                       0x2f0f2e0e, 0x2d0d2c0c, 0x2b0b2a0a, 0x29092808,
-                       0x27072606, 0x25052404, 0x23032202, 0x21012000),
-      input);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len,
+                                                                  utf8_output); // scalar code
+}
 
-  // double size of each byte, and insert the leading byte 1100 0010
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len,
+                                                               utf8_output); // scalar code
+}
 
-  /*
-  upscale the bytes to 16-bit value, adding the 0b11000000 leading byte in the
-  process. We adjust for the bytes that have their two most significant bits.
-  This takes care of the first 32 bytes, assuming we interleaved the bytes. */
-  __m512i outputA =
-      _mm512_shldi_epi16(input_interleaved, _mm512_set1_epi8(-62), 8);
-  outputA = _mm512_mask_add_epi16(
-      outputA, (__mmask32)sixth, outputA,
-      _mm512_set1_epi16(1 - 0x4000)); // 1- 0x4000 = 1100 0000 0000 0001????
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  return scalar::utf32_to_utf8::convert(buf, len, utf8_output); // scalar code
+}
 
-  // in the second 32-bit half, set first or second option based on whether
-  // original input is leading byte (second case) or not (first case)
-  __m512i leadingB =
-      _mm512_mask_blend_epi16((__mmask32)(sixth >> 32),
-                              _mm512_set1_epi16(0x00c2),  // 0000 0000 1101 0010
-                              _mm512_set1_epi16(0x40c3)); // 0100 0000 1100 0011
-  __m512i outputB = _mm512_ternarylogic_epi32(
-      input_interleaved, leadingB, _mm512_set1_epi16((short)0xff00),
-      (240 & 170) ^ 204); // (input_interleaved & 0xff00) ^ leadingB
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept { // scalar code
+  return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
+}
 
-  // prune redundant bytes
-  outputA = _mm512_maskz_compress_epi8(maskA, outputA);
-  outputB = _mm512_maskz_compress_epi8(maskB, outputB);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output); // scalar code
+}
 
-  size_t output_sizeA = (size_t)count_ones((uint32_t)nonascii) + 32;
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len,
+                                                             utf16_output); // scalar code
+}
 
-  if (mask_output) {
-    if (input_len > 32) { // is the second half of the input vector used?
-      __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_sizeA);
-      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA);
-      utf8_output += output_sizeA;
-      write_mask = _bzhi_u64(~0ULL, (unsigned int)(output_size - output_sizeA));
-      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputB);
-    } else {
-      __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_size);
-      _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA);
-    }
-  } else {
-    _mm512_storeu_si512(utf8_output, outputA);
-    utf8_output += output_sizeA;
-    _mm512_storeu_si512(utf8_output, outputB);
-  }
-  return output_size;
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( // ppc64: UTF-32 -> UTF-16BE
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len, // scalar converter instantiated for big-endian output
+                                                          utf16_output);
 }
 
-static inline size_t latin1_to_utf8_avx512_branch(__m512i input,
-                                                  char *utf8_output) {
-  __mmask64 nonascii = _mm512_movepi8_mask(input);
-  if (nonascii) {
-    return latin1_to_utf8_avx512_vec(input, 64, utf8_output, 0);
-  } else {
-    _mm512_storeu_si512(utf8_output, input);
-    return 64;
-  }
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( // ppc64: UTF-32 -> UTF-16LE, error-reporting variant
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>( // scalar routine's `result` is returned unchanged
+      buf, len, utf16_output);
 }
 
-size_t latin1_to_utf8_avx512_start(const char *buf, size_t len,
-                                   char *utf8_output) {
-  char *start = utf8_output;
-  size_t pos = 0;
-  // if there's at least 128 bytes remaining, we don't need to mask the output
-  for (; pos + 128 <= len; pos += 64) {
-    __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
-    utf8_output += latin1_to_utf8_avx512_branch(input, utf8_output);
-  }
-  // in the last 128 bytes, the first 64 may require masking the output
-  if (pos + 64 <= len) {
-    __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos));
-    utf8_output += latin1_to_utf8_avx512_vec(input, 64, utf8_output, 1);
-    pos += 64;
-  }
-  // with the last 64 bytes, the input also needs to be masked
-  if (pos < len) {
-    __mmask64 load_mask = _bzhi_u64(~0ULL, (unsigned int)(len - pos));
-    __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)(buf + pos));
-    utf8_output += latin1_to_utf8_avx512_vec(input, len - pos, utf8_output, 1);
-  }
-  return (size_t)(utf8_output - start);
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( // ppc64: UTF-32 -> UTF-16BE, error-reporting variant
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>( // scalar routine's `result` is returned unchanged
+      buf, len, utf16_output);
 }
-/* end file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */
-/* begin file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */
-// file included directly
-template <endianness big_endian>
-size_t icelake_convert_latin1_to_utf16(const char *latin1_input, size_t len,
-                                       char16_t *utf16_output) {
-  size_t rounded_len = len & ~0x1F; // Round down to nearest multiple of 32
 
-  __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809,
-                                       0x0607040502030001, 0x0e0f0c0d0a0b0809);
-  for (size_t i = 0; i < rounded_len; i += 32) {
-    // Load 32 Latin1 characters into a 256-bit register
-    __m256i in = _mm256_loadu_si256((__m256i *)&latin1_input[i]);
-    // Zero extend each set of 8 Latin1 characters to 32 16-bit integers
-    __m512i out = _mm512_cvtepu8_epi16(in);
-    if (big_endian) {
-      out = _mm512_shuffle_epi8(out, byteflip);
-    }
-    // Store the results back to memory
-    _mm512_storeu_si512((__m512i *)&utf16_output[i], out);
-  }
-  if (rounded_len != len) {
-    uint32_t mask = uint32_t(1 << (len - rounded_len)) - 1;
-    __m256i in = _mm256_maskz_loadu_epi8(mask, latin1_input + rounded_len);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( // ppc64: UTF-32 -> UTF-16LE, `_valid` variant
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>( // forwards to the scalar convert_valid, little-endian instantiation
+      buf, len, utf16_output);
+}
 
-    // Zero extend each set of 8 Latin1 characters to 32 16-bit integers
-    __m512i out = _mm512_cvtepu8_epi16(in);
-    if (big_endian) {
-      out = _mm512_shuffle_epi8(out, byteflip);
-    }
-    // Store the results back to memory
-    _mm512_mask_storeu_epi16(utf16_output + rounded_len, mask, out);
-  }
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( // ppc64: UTF-32 -> UTF-16BE, `_valid` variant
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len, // forwards to the scalar convert_valid, big-endian instantiation
+                                                                utf16_output);
+}
 
-  return len;
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( // ppc64: UTF-16LE -> UTF-32
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len, // scalar converter reading little-endian input
+                                                             utf32_output);
 }
-/* end file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */
-/* begin file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */
-std::pair<const char *, char32_t *>
-avx512_convert_latin1_to_utf32(const char *buf, size_t len,
-                               char32_t *utf32_output) {
-  size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
 
-  for (size_t i = 0; i < rounded_len; i += 16) {
-    // Load 16 Latin1 characters into a 128-bit register
-    __m128i in = _mm_loadu_si128((__m128i *)&buf[i]);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( // ppc64: UTF-16BE -> UTF-32
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len, // scalar converter reading big-endian input
+                                                          utf32_output);
+}
 
-    // Zero extend each set of 8 Latin1 characters to 16 32-bit integers using
-    // vpmovzxbd
-    __m512i out = _mm512_cvtepu8_epi32(in);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( // ppc64: UTF-16LE -> UTF-32, error-reporting variant
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>( // scalar routine's `result` is returned unchanged
+      buf, len, utf32_output);
+}
 
-    // Store the results back to memory
-    _mm512_storeu_si512((__m512i *)&utf32_output[i], out);
-  }
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( // ppc64: UTF-16BE -> UTF-32, error-reporting variant
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>( // scalar routine's `result` is returned unchanged
+      buf, len, utf32_output);
+}
 
-  // Return pointers pointing to where we left off
-  return std::make_pair(buf + rounded_len, utf32_output + rounded_len);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( // ppc64: UTF-16LE -> UTF-32, `_valid` variant
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>( // forwards to the scalar convert_valid, little-endian instantiation
+      buf, len, utf32_output);
 }
-/* end file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */
-/* begin file src/icelake/icelake_base64.inl.cpp */
-// file included directly
-/**
- * References and further reading:
- *
- * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
- * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
- * https://arxiv.org/abs/1910.05109
- *
- * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
- * Instructions, ACM Transactions on the Web 12 (3), 2018.
- * https://arxiv.org/abs/1704.00605
- *
- * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
- * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
- * Request for Comments: 4648.
- *
- * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
- * http://www.alfredklomp.com/programming/sse-base64/. (2014).
- *
- * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
- * acceleration. https://github.com/aklomp/base64. (2014).
- *
- * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
- * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
- *
- * Nick Kopp. 2013. Base64 Encoding on a GPU.
- * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
- */
 
-struct block64 {
-  __m512i chunks[1];
-};
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( // ppc64: UTF-16BE -> UTF-32, `_valid` variant
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len, // forwards to the scalar convert_valid, big-endian instantiation
+                                                                utf32_output);
+}
 
-template <bool base64_url>
-size_t encode_base64(char *dst, const char *src, size_t srclen,
-                     base64_options options) {
-  // credit: Wojciech Muła
-  const uint8_t *input = (const uint8_t *)src;
+void implementation::change_endianness_utf16(const char16_t *input, // ppc64: UTF-16 endianness change, input -> output
+                                             size_t length,
+                                             char16_t *output) const noexcept {
+  scalar::utf16::change_endianness_utf16(input, length, output); // fully delegated to the scalar helper; nothing is returned
+}
 
-  uint8_t *out = (uint8_t *)dst;
-  static const char *lookup_tbl =
-      base64_url
-          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
-          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+simdutf_warn_unused size_t implementation::count_utf16le( // ppc64: code-point count of a UTF-16LE buffer
+    const char16_t *input, size_t length) const noexcept {
+  return scalar::utf16::count_code_points<endianness::LITTLE>(input, length); // scalar counter, little-endian instantiation
+}
 
-  const __m512i shuffle_input = _mm512_setr_epi32(
-      0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10,
-      0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122,
-      0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e);
-  const __m512i lookup =
-      _mm512_loadu_si512(reinterpret_cast<const __m512i *>(lookup_tbl));
-  const __m512i multi_shifts = _mm512_set1_epi64(UINT64_C(0x3036242a1016040a));
-  size_t size = srclen;
-  __mmask64 input_mask = 0xffffffffffff; // (1 << 48) - 1
-  while (size >= 48) {
-    const __m512i v = _mm512_maskz_loadu_epi8(
-        input_mask, reinterpret_cast<const __m512i *>(input));
-    const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
-    const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
-    const __m512i result = _mm512_permutexvar_epi8(indices, lookup);
-    _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result);
-    out += 64;
-    input += 48;
-    size -= 48;
-  }
-  input_mask = ((__mmask64)1 << size) - 1;
-  const __m512i v = _mm512_maskz_loadu_epi8(
-      input_mask, reinterpret_cast<const __m512i *>(input));
-  const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v);
-  const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in);
-  bool padding_needed =
-      (((options & base64_url) == 0) ^
-       ((options & base64_reverse_padding) == base64_reverse_padding));
-  size_t padding_amount = ((size % 3) > 0) ? (3 - (size % 3)) : 0;
-  size_t output_len = ((size + 2) / 3) * 4;
-  size_t non_padded_output_len = output_len - padding_amount;
-  if (!padding_needed) {
-    output_len = non_padded_output_len;
-  }
-  __mmask64 output_mask = output_len == 64 ? (__mmask64)UINT64_MAX
-                                           : ((__mmask64)1 << output_len) - 1;
-  __m512i result = _mm512_mask_permutexvar_epi8(
-      _mm512_set1_epi8('='), ((__mmask64)1 << non_padded_output_len) - 1,
-      indices, lookup);
-  _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask,
-                          result);
-  return (size_t)(out - (uint8_t *)dst) + output_len;
+simdutf_warn_unused size_t implementation::count_utf16be( // ppc64: code-point count of a UTF-16BE buffer
+    const char16_t *input, size_t length) const noexcept {
+  return scalar::utf16::count_code_points<endianness::BIG>(input, length); // scalar counter, big-endian instantiation
 }
 
-template <bool base64_url>
-static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) {
-  __m512i input = b->chunks[0];
-  const __m512i ascii_space_tbl = _mm512_set_epi8(
-      0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10,
-      9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0,
-      0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32);
-  __m512i lookup0;
-  if (base64_url) {
-    lookup0 = _mm512_set_epi8(
-        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
-        52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128,
-        -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128,
-        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1,
-        -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1);
-  } else {
-    lookup0 = _mm512_set_epi8(
-        -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53,
-        52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128,
-        -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128,
-        -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128,
-        -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128);
-  }
-  __m512i lookup1;
-  if (base64_url) {
-    lookup1 = _mm512_set_epi8(
-        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
-        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
-        63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
-        14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
-  } else {
-    lookup1 = _mm512_set_epi8(
-        -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42,
-        41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128,
-        -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
-        15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128);
-  }
+simdutf_warn_unused size_t // ppc64: code-point count of a UTF-8 buffer
+implementation::count_utf8(const char *input, size_t length) const noexcept {
+  return utf8::count_code_points(input, length); // NOTE(review): unlike the neighbouring wrappers this is not qualified with scalar:: - it binds to whichever `utf8` namespace is visible here; confirm that is intended
+}
 
-  const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1);
-  const __m512i combined = _mm512_or_si512(translated, input);
-  const __mmask64 mask = _mm512_movepi8_mask(combined);
-  if (mask) {
-    const __mmask64 spaces = _mm512_cmpeq_epi8_mask(
-        _mm512_shuffle_epi8(ascii_space_tbl, input), input);
-    *error = (mask ^ spaces);
-  }
-  b->chunks[0] = translated;
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( // ppc64: UTF-8 length needed for a UTF-16LE buffer
+    const char16_t *input, size_t length) const noexcept {
+  return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input, // computed by the scalar helper, little-endian instantiation
+                                                                   length);
+}
 
-  return mask;
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( // ppc64: UTF-8 length needed for a UTF-16BE buffer
+    const char16_t *input, size_t length) const noexcept {
+  return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length); // computed by the scalar helper, big-endian instantiation
 }
 
-static inline void copy_block(block64 *b, char *output) {
-  _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), b->chunks[0]);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( // ppc64: UTF-32 length needed for a UTF-16LE buffer
+    const char16_t *input, size_t length) const noexcept {
+  return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input, // computed by the scalar helper, little-endian instantiation
+                                                                    length);
 }
 
-static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
-  uint64_t nmask = ~mask;
-  __m512i c = _mm512_maskz_compress_epi8(nmask, b->chunks[0]);
-  _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), c);
-  return _mm_popcnt_u64(nmask);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( // ppc64: UTF-32 length needed for a UTF-16BE buffer
+    const char16_t *input, size_t length) const noexcept {
+  return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length); // computed by the scalar helper, big-endian instantiation
 }
 
-// The caller of this function is responsible to ensure that there are 64 bytes
-// available from reading at src. The data is read into a block64 structure.
-static inline void load_block(block64 *b, const char *src) {
-  b->chunks[0] = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8( // ppc64: UTF-16 length needed for a UTF-8 buffer
+    const char *input, size_t length) const noexcept {
+  return scalar::utf8::utf16_length_from_utf8(input, length); // computed by the scalar helper; result returned as-is
 }
 
-// The caller of this function is responsible to ensure that there are 128 bytes
-// available from reading at src. The data is read into a block64 structure.
-static inline void load_block(block64 *b, const char16_t *src) {
-  __m512i m1 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src));
-  __m512i m2 = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src + 32));
-  __m512i p = _mm512_packus_epi16(m1, m2);
-  b->chunks[0] =
-      _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32( // ppc64: UTF-8 length needed for a UTF-32 buffer
+    const char32_t *input, size_t length) const noexcept {
+  return scalar::utf32::utf8_length_from_utf32(input, length); // computed by the scalar helper; result returned as-is
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32( // ppc64: UTF-16 length needed for a UTF-32 buffer
+    const char32_t *input, size_t length) const noexcept {
+  return scalar::utf32::utf16_length_from_utf32(input, length); // computed by the scalar helper; result returned as-is
 }
 
-static inline void base64_decode(char *out, __m512i str) {
-  const __m512i merge_ab_and_bc =
-      _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140));
-  const __m512i merged =
-      _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000));
-  const __m512i pack = _mm512_set_epi8(
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58,
-      52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34,
-      28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4,
-      5, 6, 0, 1, 2);
-  const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged);
-  _mm512_mask_storeu_epi8(
-      (__m512i *)out, 0xffffffffffff,
-      shuffled); // mask would be 0xffffffffffff since we write 48 bytes.
-}
-// decode 64 bytes and output 48 bytes
-static inline void base64_decode_block(char *out, const char *src) {
-  base64_decode(out,
-                _mm512_loadu_si512(reinterpret_cast<const __m512i *>(src)));
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8( // ppc64: UTF-32 length needed for a UTF-8 buffer
+    const char *input, size_t length) const noexcept {
+  return scalar::utf8::count_code_points(input, length); // the UTF-32 length is taken to be the scalar code-point count
 }
-static inline void base64_decode_block(char *out, block64 *b) {
-  base64_decode(out, b->chunks[0]);
+
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( // ppc64: output-size bound for decoding 8-bit base64 text
+    const char *input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length); // bound is computed entirely by the scalar helper
 }
 
-template <bool base64_url, typename chartype>
-full_result
-compress_decode_base64(char *dst, const chartype *src, size_t srclen,
-                       base64_options options,
-                       last_chunk_handling_options last_chunk_options) {
-  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
-                                        : tables::base64::to_base64_value;
-  size_t equallocation =
-      srclen; // location of the first padding character if any
-  size_t equalsigns = 0;
+simdutf_warn_unused result implementation::base64_to_binary( // ppc64 base64 decoder for 8-bit (char) input
+    const char *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
   // skip trailing spaces
-  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
-         to_base64[uint8_t(src[srclen - 1])] == 64) {
-    srclen--;
+  while (length > 0 &&
+         scalar::base64::is_ascii_white_space(input[length - 1])) {
+    length--;
   }
-  if (srclen > 0 && src[srclen - 1] == '=') {
-    equallocation = srclen - 1;
-    srclen--;
-    equalsigns = 1;
-    // skip trailing spaces
-    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
-           to_base64[uint8_t(src[srclen - 1])] == 64) {
-      srclen--;
+  size_t equallocation =
+      length; // location of the first padding character if any
+  size_t equalsigns = 0; // number of trailing '=' stripped below (0, 1 or 2)
+  if (length > 0 && input[length - 1] == '=') { // last non-space character is padding
+    equallocation = length - 1;
+    length -= 1;
+    equalsigns++;
+    while (length > 0 && // whitespace between the two '=' is skipped as well
+           scalar::base64::is_ascii_white_space(input[length - 1])) {
+      length--;
     }
-    if (srclen > 0 && src[srclen - 1] == '=') {
-      equallocation = srclen - 1;
-      srclen--;
-      equalsigns = 2;
+    if (length > 0 && input[length - 1] == '=') { // optional second '='; no more than two are consumed
+      equallocation = length - 1; // now the index of the earlier '='
+      equalsigns++;
+      length -= 1;
     }
   }
-  if (srclen == 0) {
+  if (length == 0) { // nothing left once spaces and padding are stripped
     if (equalsigns > 0) {
-      return {INVALID_BASE64_CHARACTER, equallocation, 0};
+      return {INVALID_BASE64_CHARACTER, equallocation}; // padding without any data is rejected at the first '='
     }
-    return {SUCCESS, 0, 0};
+    return {SUCCESS, 0}; // empty or whitespace-only input
   }
-  const chartype *const srcinit = src;
-  const char *const dstinit = dst;
-  const chartype *const srcend = src + srclen;
-
-  // figure out why block_size == 2 is sometimes best???
-  constexpr size_t block_size = 6;
-  char buffer[block_size * 64];
-  char *bufferptr = buffer;
-  if (srclen >= 64) {
-    const chartype *const srcend64 = src + srclen - 64;
-    while (src <= srcend64) {
-      block64 b;
-      load_block(&b, src);
-      src += 64;
-      uint64_t error = 0;
-      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
-      if (error) {
-        src -= 64;
-        size_t error_offset = _tzcnt_u64(error);
-        return {error_code::INVALID_BASE64_CHARACTER,
-                size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
-      }
-      if (badcharmask != 0) {
-        // optimization opportunity: check for simple masks like those made of
-        // continuous 1s followed by continuous 0s. And masks containing a
-        // single bad character.
-        bufferptr += compress_block(&b, badcharmask, bufferptr);
-      } else if (bufferptr != buffer) {
-        copy_block(&b, bufferptr);
-        bufferptr += 64;
-      } else {
-        base64_decode_block(dst, &b);
-        dst += 48;
-      }
-      if (bufferptr >= (block_size - 1) * 64 + buffer) {
-        for (size_t i = 0; i < (block_size - 1); i++) {
-          base64_decode_block(dst, buffer + i * 64);
-          dst += 48;
-        }
-        std::memcpy(buffer, buffer + (block_size - 1) * 64,
-                    64); // 64 might be too much
-        bufferptr -= (block_size - 1) * 64;
-      }
+  result r = scalar::base64::base64_tail_decode( // the whole payload is decoded by the scalar tail decoder
+      output, input, length, equalsigns, options, last_chunk_options);
+  if (last_chunk_options != stop_before_partial &&
+      r.error == error_code::SUCCESS && equalsigns > 0) {
+    // additional checks
+    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { // padding must match r.count % 3: remainder 1 needs "==", remainder 2 needs "=", remainder 0 allows none (r.count presumably = decoded byte count - confirm)
+      return {INVALID_BASE64_CHARACTER, equallocation}; // error position is the first '='
     }
   }
+  return r;
+}
 
-  char *buffer_start = buffer;
-  // Optimization note: if this is almost full, then it is worth our
-  // time, otherwise, we should just decode directly.
-  int last_block = (int)((bufferptr - buffer_start) % 64);
-  if (last_block != 0 && srcend - src + last_block >= 64) {
-
-    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
-      uint8_t val = to_base64[uint8_t(*src)];
-      *bufferptr = char(val);
-      if (!scalar::base64::is_eight_byte(*src) || val > 64) {
-        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
-                size_t(dst - dstinit)};
-      }
-      bufferptr += (val <= 63);
-      src++;
-    }
-  }
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( // ppc64: output-size bound for decoding 16-bit base64 text
+    const char16_t *input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length); // bound is computed entirely by the scalar helper
+}
 
-  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) {
-    base64_decode_block(dst, buffer_start);
-    dst += 48;
+simdutf_warn_unused result implementation::base64_to_binary( // ppc64 base64 decoder for 16-bit (char16_t) input
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  // skip trailing spaces
+  while (length > 0 &&
+         scalar::base64::is_ascii_white_space(input[length - 1])) {
+    length--;
   }
-  if ((bufferptr - buffer_start) % 64 != 0) {
-    while (buffer_start + 4 < bufferptr) {
-      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
-                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
-                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
-                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
-                        << 8;
-      triple = scalar::utf32::swap_bytes(triple);
-      std::memcpy(dst, &triple, 4);
-      dst += 3;
-      buffer_start += 4;
-    }
-    if (buffer_start + 4 <= bufferptr) {
-      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
-                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
-                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
-                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
-                        << 8;
-      triple = scalar::utf32::swap_bytes(triple);
-      std::memcpy(dst, &triple, 3);
-      dst += 3;
-      buffer_start += 4;
+  size_t equallocation =
+      length; // location of the first padding character if any
+  size_t equalsigns = 0; // number of trailing '=' stripped below (0, 1 or 2)
+  if (length > 0 && input[length - 1] == '=') { // last non-space character is padding
+    equallocation = length - 1;
+    length -= 1;
+    equalsigns++;
+    while (length > 0 && // whitespace between the two '=' is skipped as well
+           scalar::base64::is_ascii_white_space(input[length - 1])) {
+      length--;
     }
-    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
-    // backtrack
-    int leftover = int(bufferptr - buffer_start);
-    while (leftover > 0) {
-      while (to_base64[uint8_t(*(src - 1))] == 64) {
-        src--;
-      }
-      src--;
-      leftover--;
+    if (length > 0 && input[length - 1] == '=') { // optional second '='; no more than two are consumed
+      equallocation = length - 1; // now the index of the earlier '='
+      equalsigns++;
+      length -= 1;
     }
   }
-  if (src < srcend + equalsigns) {
-    full_result r = scalar::base64::base64_tail_decode(
-        dst, src, srcend - src, equalsigns, options, last_chunk_options);
-    r.input_count += size_t(src - srcinit);
-    if (r.error == error_code::INVALID_BASE64_CHARACTER ||
-        r.error == error_code::BASE64_EXTRA_BITS) {
-      return r;
-    } else {
-      r.output_count += size_t(dst - dstinit);
-    }
-    if (last_chunk_options != stop_before_partial &&
-        r.error == error_code::SUCCESS && equalsigns > 0) {
-      // additional checks
-      if ((r.output_count % 3 == 0) ||
-          ((r.output_count % 3) + 1 + equalsigns != 4)) {
-        r.error = error_code::INVALID_BASE64_CHARACTER;
-        r.input_count = equallocation;
-      }
+  if (length == 0) { // nothing left once spaces and padding are stripped
+    if (equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation}; // padding without any data is rejected at the first '='
     }
-    return r;
+    return {SUCCESS, 0}; // empty or whitespace-only input
   }
-  if (equalsigns > 0) {
-    if ((size_t(dst - dstinit) % 3 == 0) ||
-        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
+  result r = scalar::base64::base64_tail_decode( // the whole payload is decoded by the scalar tail decoder
+      output, input, length, equalsigns, options, last_chunk_options);
+  if (last_chunk_options != stop_before_partial &&
+      r.error == error_code::SUCCESS && equalsigns > 0) {
+    // additional checks
+    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { // padding must match r.count % 3: remainder 1 needs "==", remainder 2 needs "=", remainder 0 allows none (r.count presumably = decoded byte count - confirm)
+      return {INVALID_BASE64_CHARACTER, equallocation}; // error position is the first '='
     }
   }
-  return {SUCCESS, srclen, size_t(dst - dstinit)};
+  return r;
 }
-/* end file src/icelake/icelake_base64.inl.cpp */
 
-#include <cstdint>
+simdutf_warn_unused size_t implementation::base64_length_from_binary( // ppc64: base64 text length for `length` binary bytes under `options`
+    size_t length, base64_options options) const noexcept {
+  return scalar::base64::base64_length_from_binary(length, options); // computed by the scalar helper; result returned as-is
+}
 
-} // namespace
-} // namespace icelake
+size_t implementation::binary_to_base64(const char *input, size_t length, // ppc64: base64 encoder entry point
+                                        char *output,
+                                        base64_options options) const noexcept {
+  return scalar::base64::binary_to_base64(input, length, output, options); // encoding is done by the scalar routine; its return value is passed through
+}
+} // namespace ppc64
 } // namespace simdutf
 
+/* begin file src/simdutf/ppc64/end.h */
+/* end file src/simdutf/ppc64/end.h */
+/* end file src/ppc64/implementation.cpp */
+#endif
+#if SIMDUTF_IMPLEMENTATION_RVV
+/* begin file src/rvv/implementation.cpp */
+
+
+
+
+
+/* begin file src/simdutf/rvv/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "rvv"
+// #define SIMDUTF_IMPLEMENTATION rvv
+
+#if SIMDUTF_CAN_ALWAYS_RUN_RVV
+// nothing needed.
+#else
+SIMDUTF_TARGET_RVV
+#endif
+/* end file src/simdutf/rvv/begin.h */
 namespace simdutf {
-namespace icelake {
+namespace rvv {
+namespace {
+#ifndef SIMDUTF_RVV_H
+  #error "rvv.h must be included"
+#endif
 
-simdutf_warn_unused int
-implementation::detect_encodings(const char *input,
-                                 size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  // todo: convert to a one-pass algorithm
-  if (bom_encoding != encoding_type::unspecified) {
-    return bom_encoding;
+} // unnamed namespace
+} // namespace rvv
+} // namespace simdutf
+
+//
+// Implementation-specific overrides
+//
+namespace simdutf {
+namespace rvv {
+/* begin file src/rvv/rvv_helpers.inl.cpp */
+template <simdutf_ByteFlip bflip> // bflip is handed to simdutf_byteflip on the final store (presumably selects the output byte order - confirm)
+simdutf_really_inline static size_t
+rvv_utf32_store_utf16_m4(uint16_t *dst, vuint32m4_t utf32, size_t vl, // narrows vl UTF-32 lanes to UTF-16 at dst; returns the number of 16-bit units stored
+                         vbool4_t m4even) {
+  /* convert [000000000000aaaa|aaaaaabbbbbbbbbb]
+   * to      [110111bbbbbbbbbb|110110aaaaaaaaaa] */
+  vuint32m4_t sur = __riscv_vsub_vx_u32m4(utf32, 0x10000, vl);
+  sur = __riscv_vor_vv_u32m4(__riscv_vsll_vx_u32m4(sur, 16, vl),
+                             __riscv_vsrl_vx_u32m4(sur, 10, vl), vl);
+  sur = __riscv_vand_vx_u32m4(sur, 0x3FF03FF, vl);
+  sur = __riscv_vor_vx_u32m4(sur, 0xDC00D800, vl);
+  /* merge single-unit values (<= 0xFFFF) with the surrogate pairs in sur */
+  vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(utf32, 0xFFFF, vl); // lanes above 0xFFFF take the surrogate-pair encoding
+  vuint16m4_t utf32_16 = __riscv_vreinterpret_v_u32m4_u16m4(
+      __riscv_vmerge_vvm_u32m4(utf32, sur, m4, vl));
+  /* compress and store */
+  vbool4_t mOut = __riscv_vmor_mm_b4( // keep every non-zero 16-bit half plus the lanes set in m4even (presumably the even halves, so a genuine 0x0000 unit survives - confirm at call sites)
+      __riscv_vmsne_vx_u16m4_b4(utf32_16, 0, vl * 2), m4even, vl * 2);
+  vuint16m4_t vout = __riscv_vcompress_vm_u16m4(utf32_16, mOut, vl * 2);
+  vl = __riscv_vcpop_m_b4(mOut, vl * 2); // vl now counts the 16-bit units kept
+  __riscv_vse16_v_u16m4(dst, simdutf_byteflip<bflip>(vout, vl), vl);
+  return vl;
+}
+/* end file src/rvv/rvv_helpers.inl.cpp */
+
+/* begin file src/rvv/rvv_length_from.inl.cpp */
+
+simdutf_warn_unused size_t // rvv: code-point count of a UTF-16LE buffer
+implementation::count_utf16le(const char16_t *src, size_t len) const noexcept {
+  return utf32_length_from_utf16le(src, len); // reuses the UTF-32 length computation of the same class
+}
+
+simdutf_warn_unused size_t // rvv: code-point count of a UTF-16BE buffer
+implementation::count_utf16be(const char16_t *src, size_t len) const noexcept {
+  return utf32_length_from_utf16be(src, len); // reuses the UTF-32 length computation of the same class
+}
+
+simdutf_warn_unused size_t // rvv: code-point count of a UTF-8 buffer
+implementation::count_utf8(const char *src, size_t len) const noexcept {
+  return utf32_length_from_utf8(src, len); // reuses the UTF-32 length computation of the same class
+}
+
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8( // rvv: Latin-1 length needed for a UTF-8 buffer
+    const char *src, size_t len) const noexcept {
+  return utf32_length_from_utf8(src, len); // same value as the UTF-32 length (presumably because each code point becomes one Latin-1 byte - confirm)
+}
+
+/* Latin-1 length for len UTF-16 code units: returned unchanged. */
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t len) const noexcept {
+  const size_t latin1_len = len;
+  return latin1_len;
+}
+
+/* Latin-1 length for len UTF-32 code units: returned unchanged. */
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t len) const noexcept {
+  const size_t latin1_len = len;
+  return latin1_len;
+}
+
+/* UTF-16 length for len Latin-1 bytes: returned unchanged. */
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t len) const noexcept {
+  const size_t utf16_len = len;
+  return utf16_len;
+}
+
+/* UTF-32 length for len Latin-1 bytes: returned unchanged. */
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t len) const noexcept {
+  const size_t utf32_len = len;
+  return utf32_len;
+}
+
+/* Counts the bytes that are not UTF-8 continuation bytes (10xxxxxx). Read as
+ * signed values, continuation bytes are -128..-65, so "> -65" selects every
+ * other byte. */
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+    const char *src, size_t len) const noexcept {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m8(len);
+    const vint8m8_t bytes = __riscv_vle8_v_i8m8((int8_t *)src, vl);
+    const vbool1_t not_continuation = __riscv_vmsgt_vx_i8m8_b1(bytes, -65, vl);
+    total += __riscv_vcpop_m_b1(not_continuation, vl);
+    src += vl;
+    len -= vl;
   }
-  int out = 0;
-  if (validate_utf8(input, length)) {
-    out |= encoding_type::UTF8;
+  return total;
+}
+
+/* Counts the UTF-16 code units (after the optional byte flip) that lie
+ * outside 0xDC00..0xDFFF, i.e. everything except low surrogates. */
+template <simdutf_ByteFlip bflip>
+simdutf_really_inline static size_t
+rvv_utf32_length_from_utf16(const char16_t *src, size_t len) {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e16m8(len);
+    vuint16m8_t units = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
+    units = simdutf_byteflip<bflip>(units, vl);
+    const vbool2_t above = __riscv_vmsgtu_vx_u16m8_b2(units, 0xDFFF, vl);
+    const vbool2_t below = __riscv_vmsltu_vx_u16m8_b2(units, 0xDC00, vl);
+    const vbool2_t not_low_surrogate = __riscv_vmor_mm_b2(above, below, vl);
+    total += __riscv_vcpop_m_b2(not_low_surrogate, vl);
+    src += vl;
+    len -= vl;
   }
-  if ((length % 2) == 0) {
-    if (validate_utf16le(reinterpret_cast<const char16_t *>(input),
-                         length / 2)) {
-      out |= encoding_type::UTF16_LE;
-    }
+  return total;
+}
+
+/* UTF-32 length of a UTF-16LE buffer; no byte flip is applied. */
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+    const char16_t *src, size_t len) const noexcept {
+  constexpr simdutf_ByteFlip flip = simdutf_ByteFlip::NONE;
+  return rvv_utf32_length_from_utf16<flip>(src, len);
+}
+
+/* UTF-32 length of a UTF-16BE buffer. Uses the ZVBB byte-flip variant when
+ * supports_zvbb() is true and the plain-V variant otherwise. */
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+    const char16_t *src, size_t len) const noexcept {
+  return supports_zvbb()
+             ? rvv_utf32_length_from_utf16<simdutf_ByteFlip::ZVBB>(src, len)
+             : rvv_utf32_length_from_utf16<simdutf_ByteFlip::V>(src, len);
+}
+
+/* UTF-8 length of a Latin-1 buffer: one byte per input byte plus one extra
+ * byte for each input byte with the high bit set (negative as int8_t). */
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+    const char *src, size_t len) const noexcept {
+  size_t total = len;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m8(len);
+    const vint8m8_t bytes = __riscv_vle8_v_i8m8((int8_t *)src, vl);
+    const vbool1_t high_bit = __riscv_vmslt_vx_i8m8_b1(bytes, 0, vl);
+    total += __riscv_vcpop_m_b1(high_bit, vl);
+    src += vl;
+    len -= vl;
   }
-  if ((length % 4) == 0) {
-    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
-      out |= encoding_type::UTF32_LE;
-    }
+  return total;
+}
+
+/* UTF-8 length of a UTF-16 buffer (after the optional byte flip). Every code
+ * unit contributes 1, plus 1 if it is above 0x7F, plus 1 more if it is above
+ * 0x7FF and outside the surrogate range 0xD800..0xDFFF. */
+template <simdutf_ByteFlip bflip>
+simdutf_really_inline static size_t
+rvv_utf8_length_from_utf16(const char16_t *src, size_t len) {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e16m8(len);
+    vuint16m8_t units = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
+    units = simdutf_byteflip<bflip>(units, vl);
+    const vbool2_t above_7f = __riscv_vmsgtu_vx_u16m8_b2(units, 0x7F, vl);
+    const vbool2_t above_7ff = __riscv_vmsgtu_vx_u16m8_b2(units, 0x7FF, vl);
+    const vbool2_t below_sur = __riscv_vmsltu_vx_u16m8_b2(units, 0xD800, vl);
+    const vbool2_t above_sur = __riscv_vmsgtu_vx_u16m8_b2(units, 0xDFFF, vl);
+    const vbool2_t not_surrogate =
+        __riscv_vmor_mm_b2(below_sur, above_sur, vl);
+    const vbool2_t three_bytes =
+        __riscv_vmand_mm_b2(above_7ff, not_surrogate, vl);
+    total += vl + __riscv_vcpop_m_b2(above_7f, vl) +
+             __riscv_vcpop_m_b2(three_bytes, vl);
+    src += vl;
+    len -= vl;
+  }
+  return total;
+}
+
+/* UTF-8 length of a UTF-16LE buffer; no byte flip is applied. */
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+    const char16_t *src, size_t len) const noexcept {
+  constexpr simdutf_ByteFlip flip = simdutf_ByteFlip::NONE;
+  return rvv_utf8_length_from_utf16<flip>(src, len);
+}
+
+/* UTF-8 length of a UTF-16BE buffer. Uses the ZVBB byte-flip variant when
+ * supports_zvbb() is true and the plain-V variant otherwise. */
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+    const char16_t *src, size_t len) const noexcept {
+  return supports_zvbb()
+             ? rvv_utf8_length_from_utf16<simdutf_ByteFlip::ZVBB>(src, len)
+             : rvv_utf8_length_from_utf16<simdutf_ByteFlip::V>(src, len);
+}
+
+/* UTF-8 length of a UTF-32 buffer: each element contributes 1 byte, plus one
+ * more for every threshold it exceeds (0x7F, 0x7FF, 0xFFFF). */
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+    const char32_t *src, size_t len) const noexcept {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e32m8(len);
+    const vuint32m8_t cps = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
+    const vbool4_t above_7f = __riscv_vmsgtu_vx_u32m8_b4(cps, 0x7F, vl);
+    const vbool4_t above_7ff = __riscv_vmsgtu_vx_u32m8_b4(cps, 0x7FF, vl);
+    const vbool4_t above_ffff = __riscv_vmsgtu_vx_u32m8_b4(cps, 0xFFFF, vl);
+    total += vl + __riscv_vcpop_m_b4(above_7f, vl) +
+             __riscv_vcpop_m_b4(above_7ff, vl) +
+             __riscv_vcpop_m_b4(above_ffff, vl);
+    src += vl;
+    len -= vl;
+  }
+  return total;
+}
+
+/* UTF-16 length of a UTF-8 buffer: one unit for every byte that is not a
+ * continuation byte (signed value > -65), plus one more for every byte above
+ * 0b11101111 (the lead bytes of 4-byte sequences, which need a surrogate
+ * pair). */
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+    const char *src, size_t len) const noexcept {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m8(len);
+    const vint8m8_t bytes = __riscv_vle8_v_i8m8((int8_t *)src, vl);
+    const vbool1_t not_continuation = __riscv_vmsgt_vx_i8m8_b1(bytes, -65, vl);
+    const vbool1_t four_byte_lead = __riscv_vmsgtu_vx_u8m8_b1(
+        __riscv_vreinterpret_u8m8(bytes), (uint8_t)0b11101111, vl);
+    total += __riscv_vcpop_m_b1(not_continuation, vl) +
+             __riscv_vcpop_m_b1(four_byte_lead, vl);
+    src += vl;
+    len -= vl;
+  }
+  return total;
+}
+
+/* UTF-16 length of a UTF-32 buffer: one unit per element, plus one more for
+ * every element above 0xFFFF. */
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+    const char32_t *src, size_t len) const noexcept {
+  size_t total = 0;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e32m8(len);
+    const vuint32m8_t cps = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
+    const vbool4_t above_ffff = __riscv_vmsgtu_vx_u32m8_b4(cps, 0xFFFF, vl);
+    total += vl + __riscv_vcpop_m_b4(above_ffff, vl);
+    src += vl;
+    len -= vl;
+  }
+  return total;
+}
+/* end file src/rvv/rvv_length_from.inl.cpp */
+/* begin file src/rvv/rvv_validate.inl.cpp */
+
+
+/* Returns true when no input byte has its high bit set. All bytes are OR-ed
+ * into one accumulator register, which is tested once after the loop. */
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *src, size_t len) const noexcept {
+  const size_t vlmax = __riscv_vsetvlmax_e8m8();
+  vint8m8_t acc = __riscv_vmv_v_x_i8m8(0, vlmax);
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m8(len);
+    const vint8m8_t bytes = __riscv_vle8_v_i8m8((int8_t *)src, vl);
+    /* tail-undisturbed: lanes >= vl keep what earlier iterations put there */
+    acc = __riscv_vor_vv_i8m8_tu(acc, acc, bytes, vl);
+    src += vl;
+    len -= vl;
   }
-  return out;
+  const vbool1_t high_bit = __riscv_vmslt_vx_i8m8_b1(acc, 0, vlmax);
+  return __riscv_vfirst_m_b1(high_bit, vlmax) < 0;
 }
 
-simdutf_warn_unused bool
-implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  if (simdutf_unlikely(len == 0)) {
-    return true;
-  }
-  avx512_utf8_checker checker{};
-  const char *ptr = buf;
-  const char *end = ptr + len;
-  for (; end - ptr >= 64; ptr += 64) {
-    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
-    checker.check_next_input(utf8);
-  }
-  if (end != ptr) {
-    const __m512i utf8 = _mm512_maskz_loadu_epi8(
-        ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr);
-    checker.check_next_input(utf8);
+/* Scans for the first byte with the high bit set. Returns TOO_LARGE with that
+ * byte's offset, or SUCCESS with the number of bytes scanned. */
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+    const char *src, size_t len) const noexcept {
+  const char *const start = src;
+  while (len > 0) {
+    const size_t vl = __riscv_vsetvl_e8m8(len);
+    const vint8m8_t bytes = __riscv_vle8_v_i8m8((int8_t *)src, vl);
+    const vbool1_t non_ascii = __riscv_vmslt_vx_i8m8_b1(bytes, 0, vl);
+    const long first_bad = __riscv_vfirst_m_b1(non_ascii, vl);
+    if (first_bad >= 0)
+      return result(error_code::TOO_LARGE, src - start + first_bad);
+    src += vl;
+    len -= vl;
   }
-  checker.check_eof();
-  return !checker.errors();
+  return result(error_code::SUCCESS, src - start);
 }
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(
-    const char *buf, size_t len) const noexcept {
-  if (simdutf_unlikely(len == 0)) {
-    return result(error_code::SUCCESS, len);
-  }
-  avx512_utf8_checker checker{};
-  const char *ptr = buf;
-  const char *end = ptr + len;
-  size_t count{0};
-  for (; end - ptr >= 64; ptr += 64) {
-    const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr);
-    checker.check_next_input(utf8);
-    if (checker.errors()) {
-      if (count != 0) {
-        count--;
-      } // Sometimes the error is only detected in the next chunk
-      result res = scalar::utf8::rewind_and_validate_with_errors(
-          reinterpret_cast<const char *>(buf),
-          reinterpret_cast<const char *>(buf + count), len - count);
-      res.count += count;
-      return res;
-    }
-    count += 64;
-  }
-  if (end != ptr) {
-    const __m512i utf8 = _mm512_maskz_loadu_epi8(
-        ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr);
-    checker.check_next_input(utf8);
+/* Returns a close estimation of the number of valid UTF-8 bytes up to the
+ * first invalid one, but never overestimating.
+ *
+ * Inputs shorter than 32 bytes yield 0. The returned offset is moved back
+ * past any continuation bytes, so it does not point at one unless the vector
+ * loop never advanced.
+ *
+ * Bytes are converted to uint8_t before the continuation-byte tests so the
+ * result does not depend on whether plain char is signed. */
+simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src,
+                                                         size_t len) {
+  const char *beg = src;
+  if (len < 32)
+    return 0;
+
+  /* scalar-validate the first three bytes plus any continuation bytes that
+   * follow them; give up when more than three follow or the prefix is bad */
+  {
+    size_t idx = 3;
+    while (idx < len && (uint8_t(src[idx]) >> 6) == 0b10)
+      ++idx;
+    if (idx > 3 + 3 || !scalar::utf8::validate(src, idx))
+      return 0;
   }
-  checker.check_eof();
-  if (checker.errors()) {
-    if (count != 0) {
-      count--;
-    } // Sometimes the error is only detected in the next chunk
-    result res = scalar::utf8::rewind_and_validate_with_errors(
-        reinterpret_cast<const char *>(buf),
-        reinterpret_cast<const char *>(buf + count), len - count);
-    res.count += count;
-    return res;
+
+  static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080};
+  static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB};
+  static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6};
+
+  /* 16-entry byte tables, indexed by a nibble in the loop below */
+  const vuint8m1_t err1tbl =
+      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2));
+  const vuint8m1_t err2tbl =
+      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2));
+  const vuint8m1_t err3tbl =
+      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2));
+
+  /* the loop stops `tail` bytes early so the three look-ahead reads stay
+   * inside the buffer */
+  size_t tail = 3;
+  size_t n = len - tail;
+
+  for (size_t vl; n > 0; n -= vl, src += vl) {
+    vl = __riscv_vsetvl_e8m4(n);
+    vuint8m4_t v0 = __riscv_vle8_v_u8m4((uint8_t const *)src, vl);
+
+    uint8_t next0 = src[vl + 0];
+    uint8_t next1 = src[vl + 1];
+    uint8_t next2 = src[vl + 2];
+
+    /* fast path: ASCII */
+    if (__riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u8m4_b2(v0, 0b01111111, vl), vl) <
+            0 &&
+        (next0 | next1 | next2) < 0b10000000)
+      continue;
+
+    /* see "Validating UTF-8 In Less Than One Instruction Per Byte"
+     * https://arxiv.org/abs/2010.03090 */
+    vuint8m4_t v1 = __riscv_vslide1down_vx_u8m4(v0, next0, vl);
+    vuint8m4_t v2 = __riscv_vslide1down_vx_u8m4(v1, next1, vl);
+    vuint8m4_t v3 = __riscv_vslide1down_vx_u8m4(v2, next2, vl);
+
+    vuint8m4_t s1 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4(
+        __riscv_vreinterpret_v_u8m4_u16m4(v2), 4, __riscv_vsetvlmax_e16m4()));
+    vuint8m4_t s3 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4(
+        __riscv_vreinterpret_v_u8m4_u16m4(v3), 4, __riscv_vsetvlmax_e16m4()));
+
+    vuint8m4_t idx2 = __riscv_vand_vx_u8m4(v2, 0xF, vl);
+    vuint8m4_t idx1 = __riscv_vand_vx_u8m4(s1, 0xF, vl);
+    vuint8m4_t idx3 = __riscv_vand_vx_u8m4(s3, 0xF, vl);
+
+    vuint8m4_t err1 = simdutf_vrgather_u8m1x4(err1tbl, idx1);
+    vuint8m4_t err2 = simdutf_vrgather_u8m1x4(err2tbl, idx2);
+    vuint8m4_t err3 = simdutf_vrgather_u8m1x4(err3tbl, idx3);
+    vint8m4_t errs = __riscv_vreinterpret_v_u8m4_i8m4(
+        __riscv_vand_vv_u8m4(__riscv_vand_vv_u8m4(err1, err2, vl), err3, vl));
+
+    vbool2_t is_3 = __riscv_vmsgtu_vx_u8m4_b2(v1, 0b11100000 - 1, vl);
+    vbool2_t is_4 = __riscv_vmsgtu_vx_u8m4_b2(v0, 0b11110000 - 1, vl);
+    vbool2_t is_34 = __riscv_vmor_mm_b2(is_3, is_4, vl);
+    vbool2_t err34 =
+        __riscv_vmxor_mm_b2(is_34, __riscv_vmslt_vx_i8m4_b2(errs, 0, vl), vl);
+    vbool2_t errm =
+        __riscv_vmor_mm_b2(__riscv_vmsgt_vx_i8m4_b2(errs, 0, vl), err34, vl);
+    if (__riscv_vfirst_m_b2(errm, vl) >= 0)
+      break;
   }
-  return result(error_code::SUCCESS, len);
+
+  /* we need to validate the last character: step back over continuation
+   * bytes so the scalar pass starts on a character boundary */
+  while (tail < len && (uint8_t(src[0]) >> 6) == 0b10)
+    --src, ++tail;
+  return src - beg;
 }
 
 simdutf_warn_unused bool
-implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return icelake::validate_ascii(buf, len);
+implementation::validate_utf8(const char *src, size_t len) const noexcept {
+  size_t count = rvv_count_valid_utf8(src, len);
+  return scalar::utf8::validate(src + count, len - count);
 }
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(
-    const char *buf, size_t len) const noexcept {
-  const char *buf_orig = buf;
-  const char *end = buf + len;
-  const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
-  for (; end - buf >= 64; buf += 64) {
-    const __m512i input = _mm512_loadu_si512((const __m512i *)buf);
-    __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
-    if (notascii) {
-      return result(error_code::TOO_LARGE,
-                    buf - buf_orig + _tzcnt_u64(notascii));
-    }
-  }
-  if (end != buf) {
-    const __m512i input = _mm512_maskz_loadu_epi8(
-        ~UINT64_C(0) >> (64 - (end - buf)), (const __m512i *)buf);
-    __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
-    if (notascii) {
-      return result(error_code::TOO_LARGE,
-                    buf - buf_orig + _tzcnt_u64(notascii));
-    }
-  }
-  return result(error_code::SUCCESS, len);
+/* Skips the prefix accepted by rvv_count_valid_utf8(), runs the scalar
+ * validator on the rest, and offsets its reported count by the prefix. */
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+    const char *src, size_t len) const noexcept {
+  const size_t checked = rvv_count_valid_utf8(src, len);
+  const char *const rest = src + checked;
+  result tail = scalar::utf8::validate_with_errors(rest, len - checked);
+  return result(tail.error, checked + tail.count);
 }
 
 simdutf_warn_unused bool
-implementation::validate_utf16le(const char16_t *buf,
+implementation::validate_utf16le(const char16_t *src,
                                  size_t len) const noexcept {
-  const char16_t *end = buf + len;
-
-  for (; end - buf >= 32;) {
-    __m512i in = _mm512_loadu_si512((__m512i *)buf);
-    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-    __mmask32 surrogates =
-        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-    if (surrogates) {
-      __mmask32 highsurrogates =
-          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-      // high must be followed by low
-      if ((highsurrogates << 1) != lowsurrogates) {
-        return false;
-      }
-      bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-      if (ends_with_high) {
-        buf += 31; // advance only by 31 code units so that we start with the
-                   // high surrogate on the next round.
-      } else {
-        buf += 32;
-      }
-    } else {
-      buf += 32;
-    }
-  }
-  if (buf < end) {
-    __m512i in =
-        _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf);
-    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-    __mmask32 surrogates =
-        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-    if (surrogates) {
-      __mmask32 highsurrogates =
-          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-      // high must be followed by low
-      if ((highsurrogates << 1) != lowsurrogates) {
-        return false;
-      }
-    }
-  }
-  return true;
+  return validate_utf16le_with_errors(src, len).error == error_code::SUCCESS;
 }
 
 simdutf_warn_unused bool
-implementation::validate_utf16be(const char16_t *buf,
+implementation::validate_utf16be(const char16_t *src,
                                  size_t len) const noexcept {
-  const char16_t *end = buf + len;
-  const __m512i byteflip = _mm512_setr_epi64(
-      0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
-      0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
-      0x0607040502030001, 0x0e0f0c0d0a0b0809);
-  for (; end - buf >= 32;) {
-    __m512i in =
-        _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)buf), byteflip);
-    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-    __mmask32 surrogates =
-        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-    if (surrogates) {
-      __mmask32 highsurrogates =
-          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-      // high must be followed by low
-      if ((highsurrogates << 1) != lowsurrogates) {
-        return false;
-      }
-      bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-      if (ends_with_high) {
-        buf += 31; // advance only by 31 code units so that we start with the
-                   // high surrogate on the next round.
-      } else {
-        buf += 32;
-      }
-    } else {
-      buf += 32;
+  return validate_utf16be_with_errors(src, len).error == error_code::SUCCESS;
+}
+
+/* Validates surrogate pairing in a UTF-16 buffer (after the optional byte
+ * flip). Returns SURROGATE with the offset of the offending code unit, or
+ * SUCCESS with the number of code units scanned.
+ *
+ * For every position the loop compares "previous unit is a high surrogate"
+ * with "this unit is a low surrogate"; the two must agree. `last` carries the
+ * final unit of the previous chunk into the next iteration. */
+template <simdutf_ByteFlip bflip>
+simdutf_really_inline static result
+rvv_validate_utf16_with_errors(const char16_t *src, size_t len) {
+  const char16_t *beg = src;
+  uint16_t last = 0;
+  for (size_t vl; len > 0;
+       len -= vl, src += vl, last = simdutf_byteflip<bflip>(src[-1])) {
+    vl = __riscv_vsetvl_e16m8(len);
+    vuint16m8_t v1 = __riscv_vle16_v_u16m8((const uint16_t *)src, vl);
+    v1 = simdutf_byteflip<bflip>(v1, vl);
+    /* v0[i] is the unit preceding v1[i] */
+    vuint16m8_t v0 = __riscv_vslide1up_vx_u16m8(v1, last, vl);
+
+    vbool2_t surhi = __riscv_vmseq_vx_u16m8_b2(
+        __riscv_vand_vx_u16m8(v0, 0xFC00, vl), 0xD800, vl);
+    vbool2_t surlo = __riscv_vmseq_vx_u16m8_b2(
+        __riscv_vand_vx_u16m8(v1, 0xFC00, vl), 0xDC00, vl);
+
+    long idx = __riscv_vfirst_m_b2(__riscv_vmxor_mm_b2(surhi, surlo, vl), vl);
+    if (idx >= 0) {
+      /* if the preceding unit is an unpaired high surrogate, it is the one
+       * at fault, so report its position (idx - 1) instead of idx */
+      last = idx > 0 ? simdutf_byteflip<bflip>(src[idx - 1]) : last;
+      return result(error_code::SURROGATE,
+                    src - beg + idx - (last - 0xD800u < 0x400u));
     }
   }
-  if (buf < end) {
-    __m512i in = _mm512_shuffle_epi8(
-        _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf),
-        byteflip);
-    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-    __mmask32 surrogates =
-        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-    if (surrogates) {
-      __mmask32 highsurrogates =
-          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-      // high must be followed by low
-      if ((highsurrogates << 1) != lowsurrogates) {
-        return false;
-      }
-    }
+  if (last - 0xD800u < 0x400u) {
+    return result(error_code::SURROGATE,
+                  src - beg - 1); /* end on high surrogate */
+  } else {
+    return result(error_code::SUCCESS, src - beg);
   }
-  return true;
 }
 
 simdutf_warn_unused result implementation::validate_utf16le_with_errors(
-    const char16_t *buf, size_t len) const noexcept {
-  const char16_t *start_buf = buf;
-  const char16_t *end = buf + len;
-  for (; end - buf >= 32;) {
-    __m512i in = _mm512_loadu_si512((__m512i *)buf);
-    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-    __mmask32 surrogates =
-        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-    if (surrogates) {
-      __mmask32 highsurrogates =
-          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-      // high must be followed by low
-      if ((highsurrogates << 1) != lowsurrogates) {
-        uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
-        uint32_t extra_high =
-            _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
-        return result(error_code::SURROGATE,
-                      (buf - start_buf) +
-                          (extra_low < extra_high ? extra_low : extra_high));
-      }
-      bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-      if (ends_with_high) {
-        buf += 31; // advance only by 31 code units so that we start with the
-                   // high surrogate on the next round.
-      } else {
-        buf += 32;
-      }
-    } else {
-      buf += 32;
-    }
-  }
-  if (buf < end) {
-    __m512i in =
-        _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf);
-    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-    __mmask32 surrogates =
-        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-    if (surrogates) {
-      __mmask32 highsurrogates =
-          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-      // high must be followed by low
-      if ((highsurrogates << 1) != lowsurrogates) {
-        uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
-        uint32_t extra_high =
-            _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
-        return result(error_code::SURROGATE,
-                      (buf - start_buf) +
-                          (extra_low < extra_high ? extra_low : extra_high));
-      }
-    }
-  }
-  return result(error_code::SUCCESS, len);
+    const char16_t *src, size_t len) const noexcept {
+  return rvv_validate_utf16_with_errors<simdutf_ByteFlip::NONE>(src, len);
 }
 
 simdutf_warn_unused result implementation::validate_utf16be_with_errors(
-    const char16_t *buf, size_t len) const noexcept {
-  const char16_t *start_buf = buf;
-  const char16_t *end = buf + len;
-  const __m512i byteflip = _mm512_setr_epi64(
-      0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
-      0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
-      0x0607040502030001, 0x0e0f0c0d0a0b0809);
-  for (; end - buf >= 32;) {
-    __m512i in =
-        _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)buf), byteflip);
-    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-    __mmask32 surrogates =
-        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-    if (surrogates) {
-      __mmask32 highsurrogates =
-          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-      // high must be followed by low
-      if ((highsurrogates << 1) != lowsurrogates) {
-        uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
-        uint32_t extra_high =
-            _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
-        return result(error_code::SURROGATE,
-                      (buf - start_buf) +
-                          (extra_low < extra_high ? extra_low : extra_high));
-      }
-      bool ends_with_high = ((highsurrogates & 0x80000000) != 0);
-      if (ends_with_high) {
-        buf += 31; // advance only by 31 code units so that we start with the
-                   // high surrogate on the next round.
-      } else {
-        buf += 32;
-      }
-    } else {
-      buf += 32;
-    }
-  }
-  if (buf < end) {
-    __m512i in = _mm512_shuffle_epi8(
-        _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf),
-        byteflip);
-    __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
-    __mmask32 surrogates =
-        _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
-    if (surrogates) {
-      __mmask32 highsurrogates =
-          _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400)));
-      __mmask32 lowsurrogates = surrogates ^ highsurrogates;
-      // high must be followed by low
-      if ((highsurrogates << 1) != lowsurrogates) {
-        uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1));
-        uint32_t extra_high =
-            _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1));
-        return result(error_code::SURROGATE,
-                      (buf - start_buf) +
-                          (extra_low < extra_high ? extra_low : extra_high));
-      }
-    }
-  }
-  return result(error_code::SUCCESS, len);
+    const char16_t *src, size_t len) const noexcept {
+  if (supports_zvbb())
+    return rvv_validate_utf16_with_errors<simdutf_ByteFlip::ZVBB>(src, len);
+  else
+    return rvv_validate_utf16_with_errors<simdutf_ByteFlip::V>(src, len);
 }
 
 simdutf_warn_unused bool
-implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
-  const char32_t *tail = icelake::validate_utf32(buf, len);
-  if (tail) {
-    return scalar::utf32::validate(tail, len - (tail - buf));
-  } else {
-    // we come here if there was an error, or buf was nullptr which may happen
-    // for empty input.
-    return len == 0;
+implementation::validate_utf32(const char32_t *src, size_t len) const noexcept {
+  size_t vlmax = __riscv_vsetvlmax_e32m8();
+  vuint32m8_t max = __riscv_vmv_v_x_u32m8(0x10FFFF, vlmax);
+  vuint32m8_t maxOff = __riscv_vmv_v_x_u32m8(0xFFFFF7FF, vlmax);
+  for (size_t vl; len > 0; len -= vl, src += vl) {
+    vl = __riscv_vsetvl_e32m8(len);
+    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
+    vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl);
+    max = __riscv_vmaxu_vv_u32m8_tu(max, max, v, vl);
+    maxOff = __riscv_vmaxu_vv_u32m8_tu(maxOff, maxOff, off, vl);
   }
+  return __riscv_vfirst_m_b4(
+             __riscv_vmor_mm_b4(
+                 __riscv_vmsne_vx_u32m8_b4(max, 0x10FFFF, vlmax),
+                 __riscv_vmsne_vx_u32m8_b4(maxOff, 0xFFFFF7FF, vlmax), vlmax),
+             vlmax) < 0;
 }
 
+/* Validates a UTF-32 buffer and reports the first offending element.
+ * Returns TOO_LARGE for a value above 0x10FFFF, SURROGATE for a value in the
+ * surrogate range, each with the element's offset from the start, or SUCCESS
+ * with the number of elements scanned. */
 simdutf_warn_unused result implementation::validate_utf32_with_errors(
-    const char32_t *buf, size_t len) const noexcept {
-  const char32_t *buf_orig = buf;
-  if (len >= 16) {
-    const char32_t *end = buf + len - 16;
-    while (buf <= end) {
-      __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf);
-      __mmask16 outside_range = _mm512_cmp_epu32_mask(
-          utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT);
-
-      __m512i utf32_off =
-          _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
-
-      __mmask16 surrogate_range = _mm512_cmp_epu32_mask(
-          utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT);
-      if ((outside_range | surrogate_range)) {
-        auto outside_idx = _tzcnt_u32(outside_range);
-        auto surrogate_idx = _tzcnt_u32(surrogate_range);
-
-        if (outside_idx < surrogate_idx) {
-          return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx);
-        }
-
-        return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx);
-      }
-
-      buf += 16;
-    }
-  }
-  if (len > 0) {
-    __m512i utf32 = _mm512_maskz_loadu_epi32(
-        __mmask16((1U << (buf_orig + len - buf)) - 1), (const __m512i *)buf);
-    __mmask16 outside_range = _mm512_cmp_epu32_mask(
-        utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT);
-    __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
-
-    __mmask16 surrogate_range = _mm512_cmp_epu32_mask(
-        utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT);
-    if ((outside_range | surrogate_range)) {
-      auto outside_idx = _tzcnt_u32(outside_range);
-      auto surrogate_idx = _tzcnt_u32(surrogate_range);
-
-      if (outside_idx < surrogate_idx) {
-        return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx);
+    const char32_t *src, size_t len) const noexcept {
+  const char32_t *beg = src;
+  for (size_t vl; len > 0; len -= vl, src += vl) {
+    vl = __riscv_vsetvl_e32m8(len);
+    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
+    /* adding 0xFFFF2000 (mod 2^32) maps 0xD800..0xDFFF onto
+     * 0xFFFFF800..0xFFFFFFFF, so "> 0xFFFFF7FF" flags surrogates */
+    vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl);
+    /* idx1: first element above 0x10FFFF, idx2: first surrogate; -1 = none */
+    long idx1 =
+        __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 0x10FFFF, vl), vl);
+    long idx2 = __riscv_vfirst_m_b4(
+        __riscv_vmsgtu_vx_u32m8_b4(off, 0xFFFFF7FF, vl), vl);
+    /* both kinds present in this chunk: report whichever comes first */
+    if (idx1 >= 0 && idx2 >= 0) {
+      if (idx1 <= idx2) {
+        return result(error_code::TOO_LARGE, src - beg + idx1);
+      } else {
+        return result(error_code::SURROGATE, src - beg + idx2);
       }
-
-      return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx);
+    }
+    if (idx1 >= 0) {
+      return result(error_code::TOO_LARGE, src - beg + idx1);
+    }
+    if (idx2 >= 0) {
+      return result(error_code::SURROGATE, src - beg + idx2);
     }
   }
-
-  return result(error_code::SUCCESS, len);
+  return result(error_code::SUCCESS, src - beg);
 }
+/* end file src/rvv/rvv_validate.inl.cpp */
+
+/* begin file src/rvv/rvv_latin1_to.inl.cpp */
 
+/* Converts len Latin-1 bytes from src to UTF-8 in dst and returns the number
+ * of bytes written. Bytes below 0x80 are copied; every other byte becomes a
+ * two-byte sequence (lead byte 0xC2/0xC3 followed by 10xxxxxx). */
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
-    const char *buf, size_t len, char *utf8_output) const noexcept {
-  return icelake::latin1_to_utf8_avx512_start(buf, len, utf8_output);
+    const char *src, size_t len, char *dst) const noexcept {
+  char *beg = dst;
+  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) {
+    vl = __riscv_vsetvl_e8m2(len);
+    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
+    /* bytes with the high bit set (negative as int8_t) need two output bytes */
+    vbool4_t nascii =
+        __riscv_vmslt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v1), 0, vl);
+    size_t cnt = __riscv_vcpop_m_b4(nascii, vl);
+    vlOut = vl + cnt;
+    /* pure ASCII chunk: store it unchanged (vlOut == vl here) */
+    if (cnt == 0) {
+      __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut);
+      continue;
+    }
+
+    /* v0: candidate lead byte 0b110000xx built from the top two input bits */
+    vuint8m2_t v0 =
+        __riscv_vor_vx_u8m2(__riscv_vsrl_vx_u8m2(v1, 6, vl), 0b11000000, vl);
+    /* v1: clear bit 6 on non-ASCII lanes only, giving 10xxxxxx there */
+    v1 = __riscv_vand_vx_u8m2_mu(nascii, v1, v1, 0b10111111, vl);
+
+    /* (v0 + v1) + 0xFF * v1 == v0 + 256 * v1: each 16-bit element holds v0 in
+     * its low byte and v1 in its high byte, i.e. interleaved byte pairs */
+    vuint8m4_t wide =
+        __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vwmaccu_vx_u16m4(
+            __riscv_vwaddu_vv_u16m4(v0, v1, vl), 0xFF, v1, vl));
+    /* drop bytes equal to 0xC0/0xC1: the lead bytes generated for ASCII lanes */
+    vbool2_t mask = __riscv_vmsgtu_vx_u8m4_b2(
+        __riscv_vsub_vx_u8m4(wide, 0b11000000, vl * 2), 1, vl * 2);
+    vuint8m4_t comp = __riscv_vcompress_vm_u8m4(wide, mask, vl * 2);
+
+    __riscv_vse8_v_u8m4((uint8_t *)dst, comp, vlOut);
+  }
+  return dst - beg;
 }
 
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return icelake_convert_latin1_to_utf16<endianness::LITTLE>(buf, len,
-                                                             utf16_output);
+    const char *src, size_t len, char16_t *dst) const noexcept {
+  char16_t *beg = dst;
+  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
+    vl = __riscv_vsetvl_e8m4(len);
+    vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl);
+    __riscv_vse16_v_u16m8((uint16_t *)dst, __riscv_vzext_vf2_u16m8(v, vl), vl);
+  }
+  return dst - beg;
 }
 
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return icelake_convert_latin1_to_utf16<endianness::BIG>(buf, len,
-                                                          utf16_output);
+    const char *src, size_t len, char16_t *dst) const noexcept {
+  char16_t *beg = dst;
+  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
+    vl = __riscv_vsetvl_e8m4(len);
+    vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl);
+    __riscv_vse16_v_u16m8(
+        (uint16_t *)dst,
+        __riscv_vsll_vx_u16m8(__riscv_vzext_vf2_u16m8(v, vl), 8, vl), vl);
+  }
+  return dst - beg;
 }
 
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
-    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
-  std::pair<const char *, char32_t *> ret =
-      avx512_convert_latin1_to_utf32(buf, len, utf32_output);
-  if (ret.first == nullptr) {
-    return 0;
+    const char *src, size_t len, char32_t *dst) const noexcept { // widens len Latin-1 bytes at src into 32-bit units at dst
+  char32_t *beg = dst; // output start, used for the returned count
+  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { // each pass consumes and produces vl elements
+    vl = __riscv_vsetvl_e8m2(len); // u8 at LMUL=2 so the 4x wider result fits LMUL=8
+    vuint8m2_t v = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
+    __riscv_vse32_v_u32m8((uint32_t *)dst, __riscv_vzext_vf4_u32m8(v, vl), vl); // zero-extend u8 -> u32 and store
   }
-  size_t converted_chars = ret.second - utf32_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
-        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_converted_chars == 0) {
-      return 0;
-    }
-    converted_chars += scalar_converted_chars;
+  return dst - beg; // 32-bit units written; equals the original len
+}
+/* end file src/rvv/rvv_latin1_to.inl.cpp */
+/* begin file src/rvv/rvv_utf16_to.inl.cpp */
+#include <cstdio>
+
+template <simdutf_ByteFlip bflip> // bflip is passed to simdutf_byteflip (defined outside this chunk; presumably selects the byte-swap strategy)
+simdutf_really_inline static result
+rvv_utf16_to_latin1_with_errors(const char16_t *src, size_t len, char *dst) { // narrows len UTF-16 units to Latin-1 bytes, reporting the first unit above 255
+  const char16_t *const beg = src; // input start, used for the reported positions
+  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
+    vl = __riscv_vsetvl_e16m8(len); // units handled this pass (at most len)
+    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
+    v = simdutf_byteflip<bflip>(v, vl);
+    long idx = __riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 255, vl), vl); // first unit > 255 in this chunk, negative if none
+    if (idx >= 0)
+      return result(error_code::TOO_LARGE, src - beg + idx); // input index of the offending unit; nothing from this chunk was stored
+    __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl); // narrow u16 -> u8 and store
   }
-  return converted_chars;
+  return result(error_code::SUCCESS, src - beg); // units consumed, which equals bytes written
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
-    const char *buf, size_t len, char *latin1_output) const noexcept {
-  return icelake::utf8_to_latin1_avx512(buf, len, latin1_output);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+    const char16_t *src, size_t len, char *dst) const noexcept { // 0 on failure, otherwise the count from the _with_errors variant
+  const result outcome = convert_utf16le_to_latin1_with_errors(src, len, dst);
+  return outcome.error != error_code::SUCCESS ? 0 : outcome.count;
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
-    const char *buf, size_t len, char *latin1_output) const noexcept {
-  // First, try to convert as much as possible using the SIMD implementation.
-  const char *obuf = buf;
-  char *olatin1_output = latin1_output;
-  size_t written = icelake::utf8_to_latin1_avx512(obuf, len, olatin1_output);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+    const char16_t *src, size_t len, char *dst) const noexcept { // 0 on failure, otherwise the count from the _with_errors variant
+  const result outcome = convert_utf16be_to_latin1_with_errors(src, len, dst);
+  return outcome.error != error_code::SUCCESS ? 0 : outcome.count;
+}
 
-  // If we have completely converted the string
-  if (obuf == buf + len) {
-    return {simdutf::SUCCESS, written};
-  }
-  size_t pos = obuf - buf;
-  result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
-      pos, buf + pos, len - pos, latin1_output);
-  res.count += pos;
-  return res;
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+    const char16_t *src, size_t len, char *dst) const noexcept { // LE input: forwards to the shared kernel with ByteFlip::NONE
+  return rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
-    const char *buf, size_t len, char *latin1_output) const noexcept {
-  return icelake::valid_utf8_to_latin1_avx512(buf, len, latin1_output);
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+    const char16_t *src, size_t len, char *dst) const noexcept {
+  if (!supports_zvbb()) { // supports_zvbb() false: use the ByteFlip::V kernel
+    return rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::V>(src, len, dst);
+  }
+  // supports_zvbb() true: use the ByteFlip::ZVBB instantiation of the kernel.
+  return rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::ZVBB>(src, len, dst);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  utf8_to_utf16_result ret =
-      fast_avx512_convert_utf8_to_utf16<endianness::LITTLE>(buf, len,
-                                                            utf16_output);
-  if (ret.second == nullptr) {
-    return 0;
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+    const char16_t *src, size_t len, char *dst) const noexcept { // no range check: keeps only the low byte of every unit
+  const char16_t *const beg = src; // input start, used for the returned count
+  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
+    vl = __riscv_vsetvl_e16m8(len); // units handled this pass (at most len)
+    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
+    __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl); // narrow u16 -> u8 (truncates to the low byte)
   }
-  return ret.second - utf16_output;
+  return src - beg; // units consumed, which equals bytes written
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16<endianness::BIG>(
-      buf, len, utf16_output);
-  if (ret.second == nullptr) {
-    return 0;
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+    const char16_t *src, size_t len, char *dst) const noexcept { // no range check: keeps only the high byte of every loaded unit
+  const char16_t *const beg = src; // input start, used for the returned count
+  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
+    vl = __riscv_vsetvl_e16m8(len); // units handled this pass (at most len)
+    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
+    __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vnsrl_wx_u8m4(v, 8, vl), vl); // narrowing >> 8: the Latin-1 byte of a BE unit is the high half as loaded (assuming a little-endian host)
   }
-  return ret.second - utf16_output;
+  return src - beg; // units consumed, which equals bytes written
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::LITTLE>(
-      buf, len, utf16_output);
-}
+template <simdutf_ByteFlip bflip> // bflip is passed to simdutf_byteflip (defined outside this chunk; presumably selects the byte-swap strategy)
+simdutf_really_inline static result
+rvv_utf16_to_utf8_with_errors(const char16_t *src, size_t len, char *dst) {
+  size_t n = len; // input code units still to be converted
+  const char16_t *srcBeg = src; // for error positions (input index)
+  const char *dstBeg = dst; // for the success count (bytes written)
+  size_t vl8m4 = __riscv_vsetvlmax_e8m4();
+  vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2( // loop-invariant mask: byte indices i with (i & 3) == 2
+      __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4);
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return fast_avx512_convert_utf8_to_utf16_with_errors<endianness::BIG>(
-      buf, len, utf16_output);
-}
+  for (size_t vl, vlOut; n > 0;) { // vl: units consumed this pass; vlOut: bytes produced
+    vl = __riscv_vsetvl_e16m2(n);
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  utf8_to_utf16_result ret =
-      icelake::valid_utf8_to_fixed_length<endianness::LITTLE, char16_t>(
-          buf, len, utf16_output);
-  size_t saved_bytes = ret.second - utf16_output;
-  const char *end = buf + len;
-  if (ret.first == end) {
-    return saved_bytes;
-  }
+    vuint16m2_t v = __riscv_vle16_v_u16m2((uint16_t const *)src, vl);
+    v = simdutf_byteflip<bflip>(v, vl);
+    vbool8_t m234 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x80 - 1, vl); // units >= 0x80: need two or more UTF-8 bytes
 
-  // Note: AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outsiede 16-byte window.
-  //       It meas, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-    ret.first += 1;
-  }
+    if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */
+      vlOut = vl; // all ASCII: one output byte per unit
+      __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(v, vlOut),
+                          vlOut);
+      n -= vl, src += vl, dst += vlOut;
+      continue;
+    }
 
-  if (ret.first != end) {
-    const size_t scalar_saved_bytes =
-        scalar::utf8_to_utf16::convert_valid<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
+    vbool8_t m34 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x800 - 1, vl); // units >= 0x800: need three or more UTF-8 bytes
+
+    if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */
+      /* 0: [     aaa|aabbbbbb]
+       * 1: [aabbbbbb|        ] vsll 8
+       * 2: [        |   aaaaa] vsrl 6
+       * 3: [00111111|00011111]
+       * 4: [  bbbbbb|000aaaaa] (1|2)&3
+       * 5: [11000000|11000000]
+       * 6: [10bbbbbb|110aaaaa] 4|5 */
+      vuint16m2_t twoByte = __riscv_vand_vx_u16m2(
+          __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(v, 8, vl),
+                               __riscv_vsrl_vx_u16m2(v, 6, vl), vl),
+          0b0011111100011111, vl);
+      vuint16m2_t vout16 =
+          __riscv_vor_vx_u16m2_mu(m234, v, twoByte, 0b1000000011000000, vl); // ASCII lanes keep v; other lanes get the two-byte form with marker bits
+      vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16);
+
+      /* Every high byte that is zero should be compressed
+       * low bytes should never be compressed, so we set them
+       * to all ones, and then create a non-zero bytes mask */
+      vbool4_t mcomp =
+          __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(
+                                       __riscv_vor_vx_u16m2(vout16, 0xFF, vl)),
+                                   0, vl * 2);
+      vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2); // bytes kept = UTF-8 bytes produced
+
+      vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2);
+      __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut);
+
+      n -= vl, src += vl, dst += vlOut;
+      continue;
     }
-    saved_bytes += scalar_saved_bytes;
-  }
 
-  return saved_bytes;
-}
+    vbool8_t sur = __riscv_vmseq_vx_u16m2_b8( // any surrogate unit (0xD800..0xDFFF)
+        __riscv_vand_vx_u16m2(v, 0xF800, vl), 0xD800, vl);
+    long first = __riscv_vfirst_m_b8(sur, vl); // index of the first surrogate, negative if none
+    size_t tail = vl - first; // NOTE(review): never 0 here since first < vl, so the scalar loop below is always entered
+    vl = first < 0 ? vl : first; // vectorize only the units before the first surrogate
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
-    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
-  utf8_to_utf16_result ret =
-      icelake::valid_utf8_to_fixed_length<endianness::BIG, char16_t>(
-          buf, len, utf16_output);
-  size_t saved_bytes = ret.second - utf16_output;
-  const char *end = buf + len;
-  if (ret.first == end) {
-    return saved_bytes;
-  }
+    if (vl > 0) { /* 1/2/3 byte utf8 */
+      /* in: [aaaabbbb|bbcccccc]
+       * v1: [0bcccccc|        ] vsll  8
+       * v1: [10cccccc|        ] vsll  8 & 0b00111111 | 0b10000000
+       * v2: [        |110bbbbb] vsrl  6 & 0b00111111 | 0b11000000
+       * v2: [        |10bbbbbb] vsrl  6 & 0b00111111 | 0b10000000
+       * v3: [        |1110aaaa] vsrl 12 | 0b11100000
+       *  1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc]
+       *  2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc]
+       *  3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb]
+       * [10cccccc]
+       */
+      vuint16m2_t v1, v2, v3, v12;
+      v1 = __riscv_vor_vx_u16m2_mu(
+          m234, v, __riscv_vand_vx_u16m2(v, 0b00111111, vl), 0b10000000, vl);
+      v1 = __riscv_vsll_vx_u16m2(v1, 8, vl);
 
-  // Note: AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outsiede 16-byte window.
-  //       It meas, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-    ret.first += 1;
-  }
+      v2 = __riscv_vor_vx_u16m2(
+          __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 6, vl), 0b00111111,
+                                vl),
+          0b10000000, vl);
+      v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2,
+                                   0b01000000, vl);
+      v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 12, vl), 0b11100000,
+                                vl);
+      v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl);
 
-  if (ret.first != end) {
-    const size_t scalar_saved_bytes =
-        scalar::utf8_to_utf16::convert_valid<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
+      vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl); // widen to 32 bits and shift left by 8 (multiply by 256)
+      vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl); // add the three-byte lead byte, only in lanes >= 0x800
+      vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123);
+
+      vbool2_t mcomp = __riscv_vmor_mm_b2( // keep nonzero bytes, plus byte 2 of every 32-bit lane so a final byte of 0x00 is not dropped
+          m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4);
+      vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4); // bytes kept = UTF-8 bytes produced
+
+      vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4);
+      __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut);
+
+      n -= vl, src += vl, dst += vlOut;
     }
-    saved_bytes += scalar_saved_bytes;
+
+    if (tail)
+      while (n) { // scalar path: encode surrogate pairs until a non-surrogate unit is reached
+        uint16_t word = simdutf_byteflip<bflip>(src[0]);
+        if ((word & 0xFF80) == 0) {
+          break;
+        } else if ((word & 0xF800) == 0) {
+          break;
+        } else if ((word & 0xF800) != 0xD800) { // together the three tests break on any non-surrogate unit
+          break;
+        } else {
+          // must be a surrogate pair
+          if (n <= 1) // input ends on a lone surrogate
+            return result(error_code::SURROGATE, src - srcBeg);
+          uint16_t diff = word - 0xD800;
+          if (diff > 0x3FF) // first unit is in 0xDC00..0xDFFF, not a high surrogate
+            return result(error_code::SURROGATE, src - srcBeg);
+          uint16_t diff2 = simdutf_byteflip<bflip>(src[1]) - 0xDC00;
+          if (diff2 > 0x3FF) // second unit is not in 0xDC00..0xDFFF
+            return result(error_code::SURROGATE, src - srcBeg);
+
+          uint32_t value = ((diff + 0x40) << 10) + diff2; // equals 0x10000 + (diff << 10) + diff2
+
+          // will generate four UTF-8 bytes
+          // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+          *dst++ = (char)((value >> 18) | 0b11110000);
+          *dst++ = (char)(((value >> 12) & 0b111111) | 0b10000000);
+          *dst++ = (char)(((value >> 6) & 0b111111) | 0b10000000);
+          *dst++ = (char)((value & 0b111111) | 0b10000000);
+          src += 2;
+          n -= 2;
+        }
+      }
   }
 
-  return saved_bytes;
+  return result(error_code::SUCCESS, dst - dstBeg); // success reports bytes written; the error returns above report an input index
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
-    const char *buf, size_t len, char32_t *utf32_out) const noexcept {
-  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
-  utf8_to_utf32_result ret =
-      icelake::validating_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(
-          buf, len, utf32_output);
-  if (ret.second == nullptr)
-    return 0;
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+    const char16_t *src, size_t len, char *dst) const noexcept { // 0 on failure, otherwise the count from the _with_errors variant
+  const result outcome = convert_utf16le_to_utf8_with_errors(src, len, dst);
+  return outcome.error != error_code::SUCCESS ? 0 : outcome.count;
+}
 
-  size_t saved_bytes = ret.second - utf32_output;
-  const char *end = buf + len;
-  if (ret.first == end) {
-    return saved_bytes;
-  }
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+    const char16_t *src, size_t len, char *dst) const noexcept { // 0 on failure, otherwise the count from the _with_errors variant
+  const result outcome = convert_utf16be_to_utf8_with_errors(src, len, dst);
+  return outcome.error != error_code::SUCCESS ? 0 : outcome.count;
+}
 
-  // Note: the AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outside 16-byte window.
-  //       It means, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-    ret.first += 1;
-  }
-  if (ret.first != end) {
-    const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
-        ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+    const char16_t *src, size_t len, char *dst) const noexcept { // LE input: forwards to the shared kernel with ByteFlip::NONE
+  return rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
+}
 
-  return saved_bytes;
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+    const char16_t *src, size_t len, char *dst) const noexcept { // BE input: kernel variant chosen by supports_zvbb()
+  if (!supports_zvbb()) {
+    return rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::V>(src, len, dst);
+  }
+  return rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::ZVBB>(src, len, dst);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
-    const char *buf, size_t len, char32_t *utf32) const noexcept {
-  if (simdutf_unlikely(len == 0)) {
-    return {error_code::SUCCESS, 0};
-  }
-  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32);
-  auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<
-      endianness::LITTLE, uint32_t>(buf, len, utf32_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+    const char16_t *src, size_t len, char *dst) const noexcept { // no separate valid-input path: reuses the validating conversion
+  return convert_utf16le_to_utf8(src, len, dst);
+}
 
-  if (!std::get<2>(ret)) {
-    size_t pos = std::get<0>(ret) - buf;
-    // We might have an error that occurs right before  pos.
-    // This is only a concern if buf[pos] is not a continuation byte.
-    if ((buf[pos] & 0xc0) != 0x80 && pos >= 64) {
-      pos -= 1;
-    } else if ((buf[pos] & 0xc0) == 0x80 && pos >= 64) {
-      // We must check whether we are the fourth continuation byte
-      bool c1 = (buf[pos - 1] & 0xc0) == 0x80;
-      bool c2 = (buf[pos - 2] & 0xc0) == 0x80;
-      bool c3 = (buf[pos - 3] & 0xc0) == 0x80;
-      if (c1 && c2 && c3) {
-        return {simdutf::TOO_LONG, pos};
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+    const char16_t *src, size_t len, char *dst) const noexcept { // no separate valid-input path: reuses the validating conversion
+  return convert_utf16be_to_utf8(src, len, dst);
+}
+
+template <simdutf_ByteFlip bflip> // bflip is passed to simdutf_byteflip (defined outside this chunk; presumably selects the byte-swap strategy)
+simdutf_really_inline static result
+rvv_utf16_to_utf32_with_errors(const char16_t *src, size_t len, char32_t *dst) {
+  const char16_t *const srcBeg = src; // for error positions (input index)
+  char32_t *const dstBeg = dst; // for the success count (code points written)
+
+  constexpr const uint16_t ANY_SURROGATE_MASK = 0xf800;
+  constexpr const uint16_t ANY_SURROGATE_VALUE = 0xd800;
+  constexpr const uint16_t LO_SURROGATE_MASK = 0xfc00;
+  constexpr const uint16_t LO_SURROGATE_VALUE = 0xdc00;
+  constexpr const uint16_t HI_SURROGATE_MASK = 0xfc00;
+  constexpr const uint16_t HI_SURROGATE_VALUE = 0xd800;
+
+  uint16_t last = 0; // final unit of the current chunk, set below on the surrogate path
+  while (len > 0) {
+    size_t vl = __riscv_vsetvl_e16m2(len);
+    vuint16m2_t v0 = __riscv_vle16_v_u16m2((uint16_t const *)src, vl);
+    v0 = simdutf_byteflip<bflip>(v0, vl);
+
+    { // check fast-path
+      const vuint16m2_t v = __riscv_vand_vx_u16m2(v0, ANY_SURROGATE_MASK, vl);
+      const vbool8_t any_surrogate =
+          __riscv_vmseq_vx_u16m2_b8(v, ANY_SURROGATE_VALUE, vl);
+      if (__riscv_vfirst_m_b8(any_surrogate, vl) < 0) {
+        /* no surrogates */
+        __riscv_vse32_v_u32m4((uint32_t *)dst, __riscv_vzext_vf2_u32m4(v0, vl),
+                              vl);
+        len -= vl;
+        src += vl;
+        dst += vl;
+        continue;
       }
     }
-    // todo: we reset the output to utf32 instead of using std::get<2.(ret) as
-    // you'd expect. that is because
-    // validating_utf8_to_fixed_length_with_constant_checks may have processed
-    // data beyond the error.
-    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
-        pos, buf + pos, len - pos, utf32);
-    res.count += pos;
-    return res;
-  }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  const char *end = buf + len;
-  if (std::get<0>(ret) == end) {
-    return {simdutf::SUCCESS, saved_bytes};
-  }
 
-  // Note: the AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outside 16-byte window.
-  //       It means, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (std::get<0>(ret) != end and
-         ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) {
-    std::get<0>(ret) += 1;
-  }
+    if ((simdutf_byteflip<bflip>(src[0]) & LO_SURROGATE_MASK) == // a chunk may not start with a low surrogate: the vector check below only sees pairs inside the chunk
+        LO_SURROGATE_VALUE) {
+      return result(error_code::SURROGATE, src - srcBeg);
+    }
+
+    // decode surrogates
+    vuint16m2_t v1 = __riscv_vslide1down_vx_u16m2(v0, 0, vl); // v1[i] = v0[i + 1], i.e. each unit's successor
+    vl = __riscv_vsetvl_e16m2(vl - 1); // leave out the final unit: its successor is not in this chunk
+    if (vl == 0) { // the chunk was a single surrogate unit with no partner
+      return result(error_code::SURROGATE, src - srcBeg);
+    }
+
+    const vbool8_t surhi = __riscv_vmseq_vx_u16m2_b8(
+        __riscv_vand_vx_u16m2(v0, HI_SURROGATE_MASK, vl), HI_SURROGATE_VALUE,
+        vl);
+    const vbool8_t surlo = __riscv_vmseq_vx_u16m2_b8(
+        __riscv_vand_vx_u16m2(v1, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE,
+        vl);
+
+    // compress everything but lo surrogates
+    const vbool8_t compress = __riscv_vmsne_vx_u16m2_b8(
+        __riscv_vand_vx_u16m2(v0, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE,
+        vl);
 
-  if (std::get<0>(ret) != end) {
-    auto scalar_result = scalar::utf8_to_utf32::convert_with_errors(
-        std::get<0>(ret), len - (std::get<0>(ret) - buf),
-        reinterpret_cast<char32_t *>(utf32_output) + saved_bytes);
-    if (scalar_result.error != simdutf::SUCCESS) {
-      scalar_result.count += (std::get<0>(ret) - buf);
-    } else {
-      scalar_result.count += saved_bytes;
+    {
+      const vbool8_t diff = __riscv_vmxor_mm_b8(surhi, surlo, vl); // set where a high surrogate lacks a following low one, or a low one lacks a preceding high one
+      const long idx = __riscv_vfirst_m_b8(diff, vl);
+      if (idx >= 0) {
+        uint16_t word = simdutf_byteflip<bflip>(src[idx]);
+        if (word < 0xD800 || word > 0xDBFF) { // src[idx] is not a high surrogate, so the stray low surrogate is at idx + 1
+          return result(error_code::SURROGATE, src - srcBeg + idx + 1);
+        }
+        return result(error_code::SURROGATE, src - srcBeg + idx);
+      }
     }
-    return scalar_result;
-  }
 
-  return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)};
-}
+    last = simdutf_byteflip<bflip>(src[vl]); // the unit left out of the vector work above
+    vuint32m4_t utf32 = __riscv_vzext_vf2_u32m4(v0, vl);
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
-    const char *buf, size_t len, char32_t *utf32_out) const noexcept {
-  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
-  utf8_to_utf32_result ret =
-      icelake::valid_utf8_to_fixed_length<endianness::LITTLE, uint32_t>(
-          buf, len, utf32_output);
-  size_t saved_bytes = ret.second - utf32_output;
-  const char *end = buf + len;
-  if (ret.first == end) {
-    return saved_bytes;
-  }
+    // v0 = 110110yyyyyyyyyy (0xd800 + yyyyyyyyyy) --- hi surrogate
+    // v1 = 110111xxxxxxxxxx (0xdc00 + xxxxxxxxxx) --- lo surrogate
 
-  // Note: AVX512 procedure looks up 4 bytes forward, and
-  //       correctly converts multi-byte chars even if their
-  //       continuation bytes lie outsiede 16-byte window.
-  //       It meas, we have to skip continuation bytes from
-  //       the beginning ret.first, as they were already consumed.
-  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
-    ret.first += 1;
-  }
+    // t0 = u16(                    0000_00yy_yyyy_yyyy)
+    const vuint32m4_t t0 =
+        __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v0, 0x03ff, vl), vl);
+    // t1 = u32(0000_0000_0000_yyyy_yyyy_yy00_0000_0000)
+    const vuint32m4_t t1 = __riscv_vsll_vx_u32m4(t0, 10, vl);
 
-  if (ret.first != end) {
-    const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid(
-        ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
+    // t2 = u32(0000_0000_0000_0000_0000_00xx_xxxx_xxxx)
+    const vuint32m4_t t2 =
+        __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v1, 0x03ff, vl), vl);
 
-  return saved_bytes;
-}
+    // t3 = u32(0000_0000_0000_yyyy_yyyy_yyxx_xxxx_xxxx)
+    const vuint32m4_t t3 = __riscv_vor_vv_u32m4(t1, t2, vl);
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  return icelake_convert_utf16_to_latin1<endianness::LITTLE>(buf, len,
-                                                             latin1_output);
-}
+    // t4 = utf32 from surrogate pairs
+    const vuint32m4_t t4 = __riscv_vadd_vx_u32m4(t3, 0x10000, vl);
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  return icelake_convert_utf16_to_latin1<endianness::BIG>(buf, len,
-                                                          latin1_output);
-}
+    const vuint32m4_t result = __riscv_vmerge_vvm_u32m4(utf32, t4, surhi, vl); // NOTE(review): this local shadows the type `result` for the rest of the loop body
 
-simdutf_warn_unused result
-implementation::convert_utf16le_to_latin1_with_errors(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  return icelake_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
-             buf, len, latin1_output)
-      .first;
-}
+    const vuint32m4_t comp = __riscv_vcompress_vm_u32m4(result, compress, vl);
+    const size_t vlOut = __riscv_vcpop_m_b8(compress, vl); // code points produced: every lane except low surrogates
+    __riscv_vse32_v_u32m4((uint32_t *)dst, comp, vlOut);
 
-simdutf_warn_unused result
-implementation::convert_utf16be_to_latin1_with_errors(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  return icelake_convert_utf16_to_latin1_with_errors<endianness::BIG>(
-             buf, len, latin1_output)
-      .first;
-}
+    len -= vl;
+    src += vl;
+    dst += vlOut;
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  // optimization opportunity: implement custom function
-  return convert_utf16be_to_latin1(buf, len, latin1_output);
-}
+    if ((last & LO_SURROGATE_MASK) == LO_SURROGATE_VALUE) {
+      // last item is lo surrogate and got already consumed
+      len -= 1;
+      src += 1;
+    }
+  }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
-    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  // optimization opportunity: implement custom function
-  return convert_utf16le_to_latin1(buf, len, latin1_output);
+  return result(error_code::SUCCESS, dst - dstBeg); // success reports code points written; the error returns above report an input index
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  size_t outlen;
-  size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(
-      buf, len, (unsigned char *)utf8_output, &outlen);
-  if (inlen != len) {
-    return 0;
-  }
-  return outlen;
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+    const char16_t *src, size_t len, char32_t *dst) const noexcept { // 0 on failure, otherwise the count from the _with_errors variant
+  const result outcome = convert_utf16le_to_utf32_with_errors(src, len, dst);
+  return outcome.error != error_code::SUCCESS ? 0 : outcome.count;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  size_t outlen;
-  size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(
-      buf, len, (unsigned char *)utf8_output, &outlen);
-  if (inlen != len) {
-    return 0;
-  }
-  return outlen;
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+    const char16_t *src, size_t len, char32_t *dst) const noexcept { // 0 on failure, otherwise the count from the _with_errors variant
+  const result outcome = convert_utf16be_to_utf32_with_errors(src, len, dst);
+  return outcome.error != error_code::SUCCESS ? 0 : outcome.count;
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  size_t outlen;
-  size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(
-      buf, len, (unsigned char *)utf8_output, &outlen);
-  if (inlen != len) {
-    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
-        buf + inlen, len - inlen, utf8_output + outlen);
-    res.count += inlen;
-    return res;
-  }
-  return {simdutf::SUCCESS, outlen};
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+    const char16_t *src, size_t len, char32_t *dst) const noexcept { // LE input: forwards to the shared kernel with ByteFlip::NONE
+  return rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  size_t outlen;
-  size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(
-      buf, len, (unsigned char *)utf8_output, &outlen);
-  if (inlen != len) {
-    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
-        buf + inlen, len - inlen, utf8_output + outlen);
-    res.count += inlen;
-    return res;
-  }
-  return {simdutf::SUCCESS, outlen};
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+    const char16_t *src, size_t len, char32_t *dst) const noexcept {
+  if (!supports_zvbb()) { // supports_zvbb() false: use the ByteFlip::V kernel
+    return rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::V>(src, len, dst);
+  }
+  // supports_zvbb() true: use the ByteFlip::ZVBB instantiation of the kernel.
+  return rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::ZVBB>(src, len, dst);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return convert_utf16le_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+    const char16_t *src, size_t len, char32_t *dst) const noexcept { // no separate valid-input path: reuses the validating conversion
+  return convert_utf16le_to_utf32(src, len, dst);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return convert_utf16be_to_utf8(buf, len, utf8_output);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+    const char16_t *src, size_t len, char32_t *dst) const noexcept { // no separate valid-input path: reuses the validating conversion
+  return convert_utf16be_to_utf32(src, len, dst);
 }
+/* end file src/rvv/rvv_utf16_to.inl.cpp */
+/* begin file src/rvv/rvv_utf32_to.inl.cpp */
 
 simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
-    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
-  return icelake_convert_utf32_to_latin1(buf, len, latin1_output);
+    const char32_t *src, size_t len, char *dst) const noexcept { // 0 on failure, otherwise the count from the _with_errors variant
+  const result outcome = convert_utf32_to_latin1_with_errors(src, len, dst);
+  return outcome.error != error_code::SUCCESS ? 0 : outcome.count;
 }
 
 simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
-    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
-  return icelake_convert_utf32_to_latin1_with_errors(buf, len, latin1_output)
-      .first;
+    const char32_t *src, size_t len, char *dst) const noexcept { // narrows len UTF-32 values to Latin-1 bytes, reporting the first value above 255
+  const char32_t *const beg = src; // input start, used for the reported positions
+  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
+    vl = __riscv_vsetvl_e32m8(len); // values handled this pass (at most len)
+    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
+    long idx = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 255, vl), vl); // first value > 255 in this chunk, negative if none
+    if (idx >= 0)
+      return result(error_code::TOO_LARGE, src - beg + idx); // input index of the offending value; nothing from this chunk was stored
+    /* We don't use vcompress here, because its performance varies widely on
+     * current platforms. This might be worth reconsidering once there is more
+     * hardware available. */
+    __riscv_vse8_v_u8m2(
+        (uint8_t *)dst,
+        __riscv_vncvt_x_x_w_u8m2(__riscv_vncvt_x_x_w_u16m4(v, vl), vl), vl); // two narrowing steps: u32 -> u16 -> u8
+  }
+  return result(error_code::SUCCESS, src - beg); // values consumed, which equals bytes written
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
-    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
-  return icelake_convert_utf32_to_latin1(buf, len, latin1_output);
+    const char32_t *src, size_t len, char *dst) const noexcept { // no separate valid-input path: reuses the validating conversion
+  return convert_utf32_to_latin1(src, len, dst);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  std::pair<const char32_t *, char *> ret =
-      avx512_convert_utf32_to_utf8(buf, len, utf8_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+    const char32_t *src, size_t len, char *dst) const noexcept {
+  size_t n = len;
+  const char32_t *srcBeg = src;
+  const char *dstBeg = dst;
+  size_t vl8m4 = __riscv_vsetvlmax_e8m4();
+  vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2(
+      __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4);
+
+  for (size_t vl, vlOut; n > 0;) {
+    vl = __riscv_vsetvl_e32m4(n);
+
+    vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t const *)src, vl);
+    vbool8_t m234 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x80 - 1, vl);
+    vuint16m2_t vn = __riscv_vncvt_x_x_w_u16m2(v, vl);
+
+    if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */
+      vlOut = vl;
+      __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(vn, vlOut),
+                          vlOut);
+      n -= vl, src += vl, dst += vlOut;
+      continue;
     }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of
-  // code units written even if finished
-  std::pair<result, char *> ret =
-      icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
-        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+    vbool8_t m34 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x800 - 1, vl);
+
+    if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */
+      /* 0: [     aaa|aabbbbbb]
+       * 1: [aabbbbbb|        ] vsll 8
+       * 2: [        |   aaaaa] vsrl 6
+       * 3: [00111111|00111111]
+       * 4: [  bbbbbb|000aaaaa] (1|2)&3
+       * 5: [10000000|11000000]
+       * 6: [10bbbbbb|110aaaaa] 4|5 */
+      vuint16m2_t twoByte = __riscv_vand_vx_u16m2(
+          __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(vn, 8, vl),
+                               __riscv_vsrl_vx_u16m2(vn, 6, vl), vl),
+          0b0011111100111111, vl);
+      vuint16m2_t vout16 =
+          __riscv_vor_vx_u16m2_mu(m234, vn, twoByte, 0b1000000011000000, vl);
+      vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16);
+
+      /* Every high byte that is zero should be compressed away.
+       * Low bytes must never be compressed away, so we set them
+       * to all ones and then create a mask of the non-zero bytes. */
+      vbool4_t mcomp =
+          __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(
+                                       __riscv_vor_vx_u16m2(vout16, 0xFF, vl)),
+                                   0, vl * 2);
+      vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2);
+
+      vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2);
+      __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut);
+
+      n -= vl, src += vl, dst += vlOut;
+      continue;
+    }
+    long idx1 =
+        __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl);
+    vbool8_t sur = __riscv_vmseq_vx_u32m4_b8(
+        __riscv_vand_vx_u32m4(v, 0xFFFFF800, vl), 0xD800, vl);
+    long idx2 = __riscv_vfirst_m_b8(sur, vl);
+    if (idx1 >= 0 && idx2 >= 0) {
+      if (idx1 <= idx2) {
+        return result(error_code::TOO_LARGE, src - srcBeg + idx1);
+      } else {
+        return result(error_code::SURROGATE, src - srcBeg + idx2);
+      }
+    }
+    if (idx1 >= 0) {
+      return result(error_code::TOO_LARGE, src - srcBeg + idx1);
+    }
+    if (idx2 >= 0) {
+      return result(error_code::SURROGATE, src - srcBeg + idx2);
     }
-  }
-  ret.first.count =
-      ret.second -
-      utf8_output; // Set count to the number of 8-bit code units written
-  return ret.first;
-}
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  return convert_utf32_to_utf8(buf, len, utf8_output);
-}
+    vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x10000 - 1, vl);
+    long first = __riscv_vfirst_m_b8(m4, vl);
+    size_t tail = vl - first;
+    vl = first < 0 ? vl : first;
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  std::pair<const char32_t *, char16_t *> ret =
-      avx512_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf32_to_utf16::convert<endianness::LITTLE>(
-            ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
+    if (vl > 0) { /* 1/2/3 byte utf8 */
+      /* vn: [aaaabbbb|bbcccccc]
+       * v1: [0bcccccc|        ] vsll  8
+       * v1: [10cccccc|        ] vsll  8 & 0b00111111 | 0b10000000
+       * v2: [        |110bbbbb] vsrl  6 & 0b00111111 | 0b11000000
+       * v2: [        |10bbbbbb] vsrl  6 & 0b00111111 | 0b10000000
+       * v3: [        |1110aaaa] vsrl 12 | 0b11100000
+       *  1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc]
+       *  2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc]
+       *  3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb]
+       *                                               [10cccccc]
+       */
+      vuint16m2_t v1, v2, v3, v12;
+      v1 = __riscv_vor_vx_u16m2_mu(
+          m234, vn, __riscv_vand_vx_u16m2(vn, 0b00111111, vl), 0b10000000, vl);
+      v1 = __riscv_vsll_vx_u16m2(v1, 8, vl);
+
+      v2 = __riscv_vor_vx_u16m2(
+          __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 6, vl), 0b00111111,
+                                vl),
+          0b10000000, vl);
+      v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2,
+                                   0b01000000, vl);
+      v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 12, vl), 0b11100000,
+                                vl);
+      v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl);
+
+      vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl);
+      vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl);
+      vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123);
+
+      vbool2_t mcomp = __riscv_vmor_mm_b2(
+          m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4);
+      vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4);
+
+      vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4);
+      __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut);
+
+      n -= vl, src += vl, dst += vlOut;
     }
-    saved_bytes += scalar_saved_bytes;
+
+    if (tail)
+      while (n) {
+        uint32_t word = src[0];
+        if (word < 0x10000)
+          break;
+        if (word > 0x10FFFF)
+          return result(error_code::TOO_LARGE, src - srcBeg);
+        *dst++ = (uint8_t)((word >> 18) | 0b11110000);
+        *dst++ = (uint8_t)(((word >> 12) & 0b111111) | 0b10000000);
+        *dst++ = (uint8_t)(((word >> 6) & 0b111111) | 0b10000000);
+        *dst++ = (uint8_t)((word & 0b111111) | 0b10000000);
+        ++src;
+        --n;
+      }
   }
-  return saved_bytes;
+
+  return result(error_code::SUCCESS, dst - dstBeg);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  std::pair<const char32_t *, char16_t *> ret =
-      avx512_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - utf16_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf32_to_utf16::convert<endianness::BIG>(
-            ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+    const char32_t *src, size_t len, char *dst) const noexcept {
+  result res = convert_utf32_to_utf8_with_errors(src, len, dst);
+  return res.error == error_code::SUCCESS ? res.count : 0;
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of
-  // code units written even if finished
-  std::pair<result, char16_t *> ret =
-      avx512_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
-          buf, len, utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res =
-        scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count =
-      ret.second -
-      utf16_output; // Set count to the number of 8-bit code units written
-  return ret.first;
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+    const char32_t *src, size_t len, char *dst) const noexcept {
+  return convert_utf32_to_utf8(src, len, dst);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of
-  // code units written even if finished
-  std::pair<result, char16_t *> ret =
-      avx512_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
-                                                                 utf16_output);
-  if (ret.first.count != len) {
-    result scalar_res =
-        scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-            buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
+template <simdutf_ByteFlip bflip>
+simdutf_really_inline static result
+rvv_convert_utf32_to_utf16_with_errors(const char32_t *src, size_t len,
+                                       char16_t *dst) {
+  size_t vl8m2 = __riscv_vsetvlmax_e8m2();
+  vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(
+      __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2);
+  const char16_t *dstBeg = dst;
+  const char32_t *srcBeg = src;
+  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) {
+    vl = __riscv_vsetvl_e32m4(len);
+    vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t *)src, vl);
+    vuint32m4_t off = __riscv_vadd_vx_u32m4(v, 0xFFFF2000, vl);
+    long idx1 =
+        __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl);
+    long idx2 = __riscv_vfirst_m_b8(
+        __riscv_vmsgtu_vx_u32m4_b8(off, 0xFFFFF7FF, vl), vl);
+    if (idx1 >= 0 && idx2 >= 0) {
+      if (idx1 <= idx2)
+        return result(error_code::TOO_LARGE, src - srcBeg + idx1);
+      return result(error_code::SURROGATE, src - srcBeg + idx2);
+    }
+    if (idx1 >= 0)
+      return result(error_code::TOO_LARGE, src - srcBeg + idx1);
+    if (idx2 >= 0)
+      return result(error_code::SURROGATE, src - srcBeg + idx2);
+    long idx =
+        __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl);
+    if (idx < 0) {
+      vlOut = vl;
+      vuint16m2_t n =
+          simdutf_byteflip<bflip>(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut);
+      __riscv_vse16_v_u16m2((uint16_t *)dst, n, vlOut);
+      continue;
     }
+    vlOut = rvv_utf32_store_utf16_m4<bflip>((uint16_t *)dst, v, vl, m4even);
   }
-  ret.first.count =
-      ret.second -
-      utf16_output; // Set count to the number of 8-bit code units written
-  return ret.first;
+  return result(error_code::SUCCESS, dst - dstBeg);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return convert_utf32_to_utf16le(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+    const char32_t *src, size_t len, char16_t *dst) const noexcept {
+  result res = convert_utf32_to_utf16le_with_errors(src, len, dst);
+  return res.error == error_code::SUCCESS ? res.count : 0;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return convert_utf32_to_utf16be(buf, len, utf16_output);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+    const char32_t *src, size_t len, char16_t *dst) const noexcept {
+  result res = convert_utf32_to_utf16be_with_errors(src, len, dst);
+  return res.error == error_code::SUCCESS ? res.count : 0;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  std::tuple<const char16_t *, char32_t *, bool> ret =
-      icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
-                                                          utf32_output);
-  if (!std::get<2>(ret)) {
-    return 0;
-  }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+    const char32_t *src, size_t len, char16_t *dst) const noexcept {
+  return rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::NONE>(
+      src, len, dst);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  std::tuple<const char16_t *, char32_t *, bool> ret =
-      icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) {
-    return 0;
-  }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf16_to_utf32::convert<endianness::BIG>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+    const char32_t *src, size_t len, char16_t *dst) const noexcept {
+  if (supports_zvbb())
+    return rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::ZVBB>(
+        src, len, dst);
+  else
+    return rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::V>(src, len,
+                                                                       dst);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  std::tuple<const char16_t *, char32_t *, bool> ret =
-      icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
-                                                          utf32_output);
-  if (!std::get<2>(ret)) {
-    result scalar_res =
-        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    scalar_res.count += (std::get<0>(ret) - buf);
-    return scalar_res;
-  }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    result scalar_res =
-        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_res.error) {
-      scalar_res.count += (std::get<0>(ret) - buf);
-      return scalar_res;
-    } else {
-      scalar_res.count += saved_bytes;
-      return scalar_res;
+template <simdutf_ByteFlip bflip>
+simdutf_really_inline static size_t
+rvv_convert_valid_utf32_to_utf16(const char32_t *src, size_t len,
+                                 char16_t *dst) {
+  size_t vl8m2 = __riscv_vsetvlmax_e8m2();
+  vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(
+      __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2);
+  char16_t *dstBeg = dst;
+  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) {
+    vl = __riscv_vsetvl_e32m4(len);
+    vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t *)src, vl);
+    if (__riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl) <
+        0) {
+      vlOut = vl;
+      vuint16m2_t n =
+          simdutf_byteflip<bflip>(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut);
+      __riscv_vse16_v_u16m2((uint16_t *)dst, n, vlOut);
+      continue;
     }
+    vlOut = rvv_utf32_store_utf16_m4<bflip>((uint16_t *)dst, v, vl, m4even);
   }
-  return simdutf::result(simdutf::SUCCESS, saved_bytes);
+  return dst - dstBeg;
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  std::tuple<const char16_t *, char32_t *, bool> ret =
-      icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) {
-    result scalar_res =
-        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    scalar_res.count += (std::get<0>(ret) - buf);
-    return scalar_res;
-  }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    result scalar_res =
-        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_res.error) {
-      scalar_res.count += (std::get<0>(ret) - buf);
-      return scalar_res;
-    } else {
-      scalar_res.count += saved_bytes;
-      return scalar_res;
-    }
-  }
-  return simdutf::result(simdutf::SUCCESS, saved_bytes);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+    const char32_t *src, size_t len, char16_t *dst) const noexcept {
+  return rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::NONE>(src, len,
+                                                                  dst);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  std::tuple<const char16_t *, char32_t *, bool> ret =
-      icelake::convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
-                                                          utf32_output);
-  if (!std::get<2>(ret)) {
-    return 0;
-  }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+    const char32_t *src, size_t len, char16_t *dst) const noexcept {
+  if (supports_zvbb())
+    return rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::ZVBB>(src, len,
+                                                                    dst);
+  else
+    return rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::V>(src, len, dst);
 }
+/* end file src/rvv/rvv_utf32_to.inl.cpp */
+/* begin file src/rvv/rvv_utf8_to.inl.cpp */
+template <typename Tdst, simdutf_ByteFlip bflip, bool validate = true>
+simdutf_really_inline static size_t rvv_utf8_to_common(char const *src,
+                                                       size_t len, Tdst *dst) {
+  static_assert(std::is_same<Tdst, uint16_t>() ||
+                    std::is_same<Tdst, uint32_t>(),
+                "invalid type");
+  constexpr bool is16 = std::is_same<Tdst, uint16_t>();
+  constexpr endianness endian =
+      bflip == simdutf_ByteFlip::NONE ? endianness::LITTLE : endianness::BIG;
+  const auto scalar = [](char const *in, size_t count, Tdst *out) {
+    return is16 ? scalar::utf8_to_utf16::convert<endian>(in, count,
+                                                         (char16_t *)out)
+                : scalar::utf8_to_utf32::convert(in, count, (char32_t *)out);
+  };
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  std::tuple<const char16_t *, char32_t *, bool> ret =
-      icelake::convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
-  if (!std::get<2>(ret)) {
-    return 0;
-  }
-  size_t saved_bytes = std::get<1>(ret) - utf32_output;
-  if (std::get<0>(ret) != buf + len) {
-    const size_t scalar_saved_bytes =
-        scalar::utf16_to_utf32::convert<endianness::BIG>(
-            std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret));
-    if (scalar_saved_bytes == 0) {
+  if (len < 32)
+    return scalar(src, len, dst);
+
+  /* validate the first three bytes plus any continuation bytes after them */
+  if (validate) {
+    size_t idx = 3;
+    while (idx < len && (src[idx] >> 6) == 0b10)
+      ++idx;
+    if (idx > 3 + 3 || !scalar::utf8::validate(src, idx))
       return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
   }
-  return saved_bytes;
-}
 
-void implementation::change_endianness_utf16(const char16_t *input,
-                                             size_t length,
-                                             char16_t *output) const noexcept {
-  size_t pos = 0;
-  const __m512i byteflip = _mm512_setr_epi64(
-      0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
-      0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
-      0x0607040502030001, 0x0e0f0c0d0a0b0809);
-  while (pos + 32 <= length) {
-    __m512i utf16 = _mm512_loadu_si512((const __m512i *)(input + pos));
-    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
-    _mm512_storeu_si512(output + pos, utf16);
-    pos += 32;
-  }
-  if (pos < length) {
-    __mmask32 m((1U << (length - pos)) - 1);
-    __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i *)(input + pos));
-    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
-    _mm512_mask_storeu_epi16(output + pos, m, utf16);
-  }
-}
+  size_t tail = 3;
+  size_t n = len - tail;
+  Tdst *beg = dst;
 
-simdutf_warn_unused size_t implementation::count_utf16le(
-    const char16_t *input, size_t length) const noexcept {
-  const char16_t *ptr = input;
-  size_t count{0};
+  static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080};
+  static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB};
+  static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6};
 
-  if (length >= 32) {
-    const char16_t *end = input + length - 32;
+  const vuint8m1_t err1tbl =
+      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2));
+  const vuint8m1_t err2tbl =
+      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2));
+  const vuint8m1_t err3tbl =
+      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2));
 
-    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
-    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
+  size_t vl8m2 = __riscv_vsetvlmax_e8m2();
+  vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(
+      __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2);
 
-    while (ptr <= end) {
-      __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr);
-      ptr += 32;
-      uint64_t not_high_surrogate =
-          static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) |
-                                _mm512_cmplt_epu16_mask(utf16, low));
-      count += count_ones(not_high_surrogate);
+  for (size_t vl, vlOut; n > 0; n -= vl, src += vl, dst += vlOut) {
+    vl = __riscv_vsetvl_e8m2(n);
+
+    vuint8m2_t v0 = __riscv_vle8_v_u8m2((uint8_t const *)src, vl);
+    uint64_t max = __riscv_vmv_x_s_u8m1_u8(
+        __riscv_vredmaxu_vs_u8m2_u8m1(v0, __riscv_vmv_s_x_u8m1(0, vl), vl));
+
+    uint8_t next0 = src[vl + 0];
+    uint8_t next1 = src[vl + 1];
+    uint8_t next2 = src[vl + 2];
+
+    /* fast path: ASCII */
+    if ((max | next0 | next1 | next2) < 0b10000000) {
+      vlOut = vl;
+      if (is16)
+        __riscv_vse16_v_u16m4(
+            (uint16_t *)dst,
+            simdutf_byteflip<bflip>(__riscv_vzext_vf2_u16m4(v0, vlOut), vlOut),
+            vlOut);
+      else
+        __riscv_vse32_v_u32m8((uint32_t *)dst,
+                              __riscv_vzext_vf4_u32m8(v0, vlOut), vlOut);
+      continue;
     }
-  }
 
-  return count + scalar::utf16::count_code_points<endianness::LITTLE>(
-                     ptr, length - (ptr - input));
-}
+    /* see "Validating UTF-8 In Less Than One Instruction Per Byte"
+     * https://arxiv.org/abs/2010.03090 */
+    vuint8m2_t v1 = __riscv_vslide1down_vx_u8m2(v0, next0, vl);
+    vuint8m2_t v2 = __riscv_vslide1down_vx_u8m2(v1, next1, vl);
+    vuint8m2_t v3 = __riscv_vslide1down_vx_u8m2(v2, next2, vl);
 
-simdutf_warn_unused size_t implementation::count_utf16be(
-    const char16_t *input, size_t length) const noexcept {
-  const char16_t *ptr = input;
-  size_t count{0};
-  if (length >= 32) {
+    if (validate) {
+      vuint8m2_t s1 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(
+          __riscv_vreinterpret_v_u8m2_u16m2(v2), 4, __riscv_vsetvlmax_e16m2()));
+      vuint8m2_t s3 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(
+          __riscv_vreinterpret_v_u8m2_u16m2(v3), 4, __riscv_vsetvlmax_e16m2()));
 
-    const char16_t *end = input + length - 32;
+      vuint8m2_t idx2 = __riscv_vand_vx_u8m2(v2, 0xF, vl);
+      vuint8m2_t idx1 = __riscv_vand_vx_u8m2(s1, 0xF, vl);
+      vuint8m2_t idx3 = __riscv_vand_vx_u8m2(s3, 0xF, vl);
 
-    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
-    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
+      vuint8m2_t err1 = simdutf_vrgather_u8m1x2(err1tbl, idx1);
+      vuint8m2_t err2 = simdutf_vrgather_u8m1x2(err2tbl, idx2);
+      vuint8m2_t err3 = simdutf_vrgather_u8m1x2(err3tbl, idx3);
+      vint8m2_t errs = __riscv_vreinterpret_v_u8m2_i8m2(
+          __riscv_vand_vv_u8m2(__riscv_vand_vv_u8m2(err1, err2, vl), err3, vl));
 
-    const __m512i byteflip = _mm512_setr_epi64(
-        0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
-        0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
-        0x0607040502030001, 0x0e0f0c0d0a0b0809);
-    while (ptr <= end) {
-      __m512i utf16 =
-          _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)ptr), byteflip);
-      ptr += 32;
-      uint64_t not_high_surrogate =
-          static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) |
-                                _mm512_cmplt_epu16_mask(utf16, low));
-      count += count_ones(not_high_surrogate);
+      vbool4_t is_3 = __riscv_vmsgtu_vx_u8m2_b4(v1, 0b11100000 - 1, vl);
+      vbool4_t is_4 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b11110000 - 1, vl);
+      vbool4_t is_34 = __riscv_vmor_mm_b4(is_3, is_4, vl);
+      vbool4_t err34 =
+          __riscv_vmxor_mm_b4(is_34, __riscv_vmslt_vx_i8m2_b4(errs, 0, vl), vl);
+      vbool4_t errm =
+          __riscv_vmor_mm_b4(__riscv_vmsgt_vx_i8m2_b4(errs, 0, vl), err34, vl);
+      if (__riscv_vfirst_m_b4(errm, vl) >= 0)
+        return 0;
     }
-  }
 
-  return count + scalar::utf16::count_code_points<endianness::BIG>(
-                     ptr, length - (ptr - input));
-}
+    /* decoding */
 
-simdutf_warn_unused size_t
-implementation::count_utf8(const char *input, size_t length) const noexcept {
-  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
-  size_t answer =
-      length / sizeof(__m512i) *
-      sizeof(__m512i); // Number of 512-bit chunks that fits into the length.
-  size_t i = 0;
-  __m512i unrolled_popcount{0};
+    /* mask of non continuation bytes */
+    vbool4_t m =
+        __riscv_vmsgt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v0), -65, vl);
+    vlOut = __riscv_vcpop_m_b4(m, vl);
 
-  const __m512i continuation = _mm512_set1_epi8(char(0b10111111));
+    /* extract first and second bytes */
+    vuint8m2_t b1 = __riscv_vcompress_vm_u8m2(v0, m, vl);
+    vuint8m2_t b2 = __riscv_vcompress_vm_u8m2(v1, m, vl);
 
-  while (i + sizeof(__m512i) <= length) {
-    size_t iterations = (length - i) / sizeof(__m512i);
+    /* fast path: one and two byte */
+    if (max < 0b11100000) {
+      b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);
 
-    size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i);
-    for (; i + 8 * sizeof(__m512i) <= max_i; i += 8 * sizeof(__m512i)) {
-      __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
-      __m512i input2 =
-          _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
-      __m512i input3 =
-          _mm512_loadu_si512((const __m512i *)(str + i + 2 * sizeof(__m512i)));
-      __m512i input4 =
-          _mm512_loadu_si512((const __m512i *)(str + i + 3 * sizeof(__m512i)));
-      __m512i input5 =
-          _mm512_loadu_si512((const __m512i *)(str + i + 4 * sizeof(__m512i)));
-      __m512i input6 =
-          _mm512_loadu_si512((const __m512i *)(str + i + 5 * sizeof(__m512i)));
-      __m512i input7 =
-          _mm512_loadu_si512((const __m512i *)(str + i + 6 * sizeof(__m512i)));
-      __m512i input8 =
-          _mm512_loadu_si512((const __m512i *)(str + i + 7 * sizeof(__m512i)));
+      vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut);
+      b1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut);
 
-      __mmask64 mask1 = _mm512_cmple_epi8_mask(input1, continuation);
-      __mmask64 mask2 = _mm512_cmple_epi8_mask(input2, continuation);
-      __mmask64 mask3 = _mm512_cmple_epi8_mask(input3, continuation);
-      __mmask64 mask4 = _mm512_cmple_epi8_mask(input4, continuation);
-      __mmask64 mask5 = _mm512_cmple_epi8_mask(input5, continuation);
-      __mmask64 mask6 = _mm512_cmple_epi8_mask(input6, continuation);
-      __mmask64 mask7 = _mm512_cmple_epi8_mask(input7, continuation);
-      __mmask64 mask8 = _mm512_cmple_epi8_mask(input8, continuation);
+      vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(
+          b1,
+          __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1,
+                                  vlOut),
+          vlOut);
+      b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut);
+      if (is16)
+        __riscv_vse16_v_u16m4((uint16_t *)dst,
+                              simdutf_byteflip<bflip>(b12, vlOut), vlOut);
+      else
+        __riscv_vse32_v_u32m8((uint32_t *)dst,
+                              __riscv_vzext_vf2_u32m8(b12, vlOut), vlOut);
+      continue;
+    }
 
-      __m512i mask_register = _mm512_set_epi64(mask8, mask7, mask6, mask5,
-                                               mask4, mask3, mask2, mask1);
+    /* fast path: one, two and three byte */
+    if (max < 0b11110000) {
+      vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);
 
-      unrolled_popcount = _mm512_add_epi64(unrolled_popcount,
-                                           _mm512_popcnt_epi64(mask_register));
-    }
+      b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);
+      b3 = __riscv_vand_vx_u8m2(b3, 0b00111111, vlOut);
 
-    for (; i <= max_i; i += sizeof(__m512i)) {
-      __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));
-      uint64_t continuation_bitmask = static_cast<uint64_t>(
-          _mm512_cmple_epi8_mask(more_input, continuation));
-      answer -= count_ones(continuation_bitmask);
+      vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut);
+      vbool4_t m3 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b11011111, vlOut);
+
+      vuint8m2_t t1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut);
+      b1 = __riscv_vand_vx_u8m2_mu(m3, t1, b1, 15, vlOut);
+
+      vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(
+          b1,
+          __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1,
+                                  vlOut),
+          vlOut);
+      b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut);
+      vuint16m4_t b123 = __riscv_vwaddu_wv_u16m4_mu(
+          m3, b12, __riscv_vsll_vx_u16m4_mu(m3, b12, b12, 6, vlOut), b3, vlOut);
+      if (is16)
+        __riscv_vse16_v_u16m4((uint16_t *)dst,
+                              simdutf_byteflip<bflip>(b123, vlOut), vlOut);
+      else
+        __riscv_vse32_v_u32m8((uint32_t *)dst,
+                              __riscv_vzext_vf2_u32m8(b123, vlOut), vlOut);
+      continue;
     }
-  }
 
-  __m256i first_half = _mm512_extracti64x4_epi64(unrolled_popcount, 0);
-  __m256i second_half = _mm512_extracti64x4_epi64(unrolled_popcount, 1);
-  answer -= (size_t)_mm256_extract_epi64(first_half, 0) +
-            (size_t)_mm256_extract_epi64(first_half, 1) +
-            (size_t)_mm256_extract_epi64(first_half, 2) +
-            (size_t)_mm256_extract_epi64(first_half, 3) +
-            (size_t)_mm256_extract_epi64(second_half, 0) +
-            (size_t)_mm256_extract_epi64(second_half, 1) +
-            (size_t)_mm256_extract_epi64(second_half, 2) +
-            (size_t)_mm256_extract_epi64(second_half, 3);
+    /* extract third and fourth bytes */
+    vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);
+    vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl);
+
+    /* remove prefix from leading bytes
+     *
+     * We could also use vrgather here, but it increases register pressure,
+     * and its performance varies widely on current platforms. It might be
+     * worth reconsidering, though, once there is more hardware available.
+     * Same goes for the __riscv_vsrl_vv_u32m4 correction step.
+     *
+     * We shift left and then right by the number of bits in the prefix,
+     * which can be calculated as follows:
+     *         x                                max(x-10, 0)
+     * 0xxx -> 0000-0111 -> shift by 0 or 1  -> 0
+     * 10xx -> 1000-1011 -> don't care
+     * 110x -> 1100,1101 -> shift by 3       -> 2,3
+     * 1110 -> 1110      -> shift by 4       -> 4
+     * 1111 -> 1111      -> shift by 5       -> 5
+     *
+     * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we
+     * just need to manually detect and handle the one special case:
+     */
+#define SIMDUTF_RVV_UTF8_TO_COMMON_M1(idx)                                     \
+  vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx);                           \
+  vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx);                           \
+  vuint8m1_t c3 = __riscv_vget_v_u8m2_u8m1(b3, idx);                           \
+  vuint8m1_t c4 = __riscv_vget_v_u8m2_u8m1(b4, idx);                           \
+  /* remove prefix from trailing bytes */                                      \
+  c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut);                            \
+  c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut);                            \
+  c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut);                            \
+  vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut);                       \
+  shift = __riscv_vmerge_vxm_u8m1(__riscv_vssubu_vx_u8m1(shift, 10, vlOut), 3, \
+                                  __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut),  \
+                                  vlOut);                                      \
+  c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut);                                 \
+  c1 = __riscv_vsrl_vv_u8m1(c1, shift, vlOut);                                 \
+  /* unconditionally widen and combine to c1234 */                             \
+  vuint16m2_t c34 = __riscv_vwaddu_wv_u16m2(                                   \
+      __riscv_vwmulu_vx_u16m2(c3, 1 << 6, vlOut), c4, vlOut);                  \
+  vuint16m2_t c12 = __riscv_vwaddu_wv_u16m2(                                   \
+      __riscv_vwmulu_vx_u16m2(c1, 1 << 6, vlOut), c2, vlOut);                  \
+  vuint32m4_t c1234 = __riscv_vwaddu_wv_u32m4(                                 \
+      __riscv_vwmulu_vx_u32m4(c12, 1 << 12, vlOut), c34, vlOut);               \
+  /* derive required right-shift amount from `shift` to reduce                 \
+   * c1234 to the required number of bytes */                                  \
+  c1234 = __riscv_vsrl_vv_u32m4(                                               \
+      c1234,                                                                   \
+      __riscv_vzext_vf4_u32m4(                                                 \
+          __riscv_vmul_vx_u8m1(                                                \
+              __riscv_vrsub_vx_u8m1(__riscv_vssubu_vx_u8m1(shift, 2, vlOut),   \
+                                    3, vlOut),                                 \
+              6, vlOut),                                                       \
+          vlOut),                                                              \
+      vlOut);                                                                  \
+  /* store result in desired format */                                         \
+  if (is16)                                                                    \
+    vlDst = rvv_utf32_store_utf16_m4<bflip>((uint16_t *)dst, c1234, vlOut,     \
+                                            m4even);                           \
+  else                                                                         \
+    vlDst = vlOut, __riscv_vse32_v_u32m4((uint32_t *)dst, c1234, vlOut);
 
-  return answer + scalar::utf8::count_code_points(
-                      reinterpret_cast<const char *>(str + i), length - i);
-}
+    /* Unrolling this manually reduces register pressure and allows
+     * us to terminate early. */
+    {
+      size_t vlOutm2 = vlOut, vlDst;
+      vlOut = __riscv_vsetvl_e8m1(vlOut);
+      SIMDUTF_RVV_UTF8_TO_COMMON_M1(0)
+      if (vlOutm2 == vlOut) {
+        vlOut = vlDst;
+        continue;
+      }
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
-    const char *buf, size_t len) const noexcept {
-  return count_utf8(buf, len);
-}
+      dst += vlDst;
+      vlOut = vlOutm2 - vlOut;
+    }
+    {
+      size_t vlDst;
+      SIMDUTF_RVV_UTF8_TO_COMMON_M1(1)
+      vlOut = vlDst;
+    }
 
-simdutf_warn_unused size_t
-implementation::latin1_length_from_utf16(size_t length) const noexcept {
-  return scalar::utf16::latin1_length_from_utf16(length);
-}
+#undef SIMDUTF_RVV_UTF8_TO_COMMON_M1
+  }
 
-simdutf_warn_unused size_t
-implementation::latin1_length_from_utf32(size_t length) const noexcept {
-  return scalar::utf32::latin1_length_from_utf32(length);
+  /* validate the last character and reparse it + tail */
+  if (len > tail) {
+    if ((src[0] >> 6) == 0b10)
+      --dst;
+    while ((src[0] >> 6) == 0b10 && tail < len)
+      --src, ++tail;
+    if (is16) {
+      /* go back one more, when on high surrogate */
+      if (simdutf_byteflip<bflip>((uint16_t)dst[-1]) >= 0xD800 &&
+          simdutf_byteflip<bflip>((uint16_t)dst[-1]) <= 0xDBFF)
+        --dst;
+    }
+  }
+  size_t ret = scalar(src, tail, dst);
+  if (ret == 0)
+    return 0;
+  return (size_t)(dst - beg) + ret;
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
-    const char16_t *input, size_t length) const noexcept {
-  const char16_t *ptr = input;
-  size_t count{0};
-  if (length >= 32) {
-    const char16_t *end = input + length - 32;
-
-    const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
-    const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
-    const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
-    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+    const char *src, size_t len, char *dst) const noexcept {
+  const char *beg = dst; // output start, used for the returned count
+  uint8_t last = 0; // last input byte of the previous chunk (0 at start)
+  for (size_t vl, vlOut; len > 0;
+       len -= vl, src += vl, dst += vlOut, last = src[-1]) {
+    vl = __riscv_vsetvl_e8m2(len);
+    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
+    // check which bytes are ASCII (below 0x80)
+    vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl);
+    // count ASCII bytes; vlOut == vl means the whole chunk is ASCII
+    vlOut = __riscv_vcpop_m_b4(ascii, vl);
+    // The original code would only enter the next block after this check:
+    //   vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
+    //   vlOut = __riscv_vcpop_m_b4(m, vl);
+    //   if (vlOut != vl || last > 0b01111111) {...}
+    // so when everything was ASCII or continuation bytes, it proceeded
+    // without any processing, going straight to __riscv_vse8_v_u8m2.
+    // But the __riscv_vslide1up_vx_u8m2 step is needed whenever there is a
+    // non-ASCII byte, hence the ASCII-only test below.
+    if (vlOut != vl) { // If not pure ASCII
+      // Non-ASCII characters
+      // Mark the ASCII and continuation bytes (everything below 0xC0)
+      vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
+      // We count them, that's our new vlOut (output vector length)
+      vlOut = __riscv_vcpop_m_b4(m, vl);
 
-    while (ptr <= end) {
-      __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr);
-      ptr += 32;
-      __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
-      __mmask32 two_bytes_bitmask =
-          _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
-      __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
-      __mmask32 surrogates_bitmask =
-          _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) &
-          _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
+      vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl); // v0[i] = v1[i-1]
 
-      size_t ascii_count = count_ones(ascii_bitmask);
-      size_t two_bytes_count = count_ones(two_bytes_bitmask);
-      size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
-      size_t three_bytes_count =
-          32 - ascii_count - two_bytes_count - surrogate_bytes_count;
+      vbool4_t leading0 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b10111111, vl);
+      vbool4_t trailing1 = __riscv_vmslt_vx_i8m2_b4(
+          __riscv_vreinterpret_v_u8m2_i8m2(v1), (uint8_t)0b11000000, vl);
+      // -62 is 0b11000010: any lead byte in v0 besides 0xC2/0xC3 is too big
+      vbool4_t tobig = __riscv_vmand_mm_b4(
+          leading0,
+          __riscv_vmsgtu_vx_u8m2_b4(__riscv_vxor_vx_u8m2(v0, (uint8_t)-62, vl),
+                                    1, vl),
+          vl);
+      if (__riscv_vfirst_m_b4(
+              __riscv_vmor_mm_b4(
+                  tobig, __riscv_vmxor_mm_b4(leading0, trailing1, vl), vl),
+              vl) >= 0)
+        return 0; // out-of-range lead, or lead/continuation mismatch
 
-      count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count +
-               2 * surrogate_bytes_count;
+      v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl),
+                                  v1, v1, 0b01000000, vl); // after 0xC3: set bit 6
+      v1 = __riscv_vcompress_vm_u8m2(v1, m, vl); // pack the bytes kept by m
+    } else if (last >= 0b11000000) { // Previous chunk ended on a leading byte
+                                     // but this one is pure ASCII: error!
+      return 0;
     }
+    __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut); // store vlOut bytes
   }
-
-  return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(
-                     ptr, length - (ptr - input));
+  if (last > 0b10111111) // input ends on a leading byte
+    return 0;
+  return dst - beg; // number of Latin-1 bytes written
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
-    const char16_t *input, size_t length) const noexcept {
-  const char16_t *ptr = input;
-  size_t count{0};
-
-  if (length >= 32) {
-    const char16_t *end = input + length - 32;
-
-    const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
-    const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
-    const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
-    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
-
-    const __m512i byteflip = _mm512_setr_epi64(
-        0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001,
-        0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809,
-        0x0607040502030001, 0x0e0f0c0d0a0b0809);
-    while (ptr <= end) {
-      __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr);
-      utf16 = _mm512_shuffle_epi8(utf16, byteflip);
-      ptr += 32;
-      __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
-      __mmask32 two_bytes_bitmask =
-          _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
-      __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
-      __mmask32 surrogates_bitmask =
-          _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) &
-          _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+    const char *src, size_t len, char *dst) const noexcept {
+  size_t res = convert_utf8_to_latin1(src, len, dst); // vector attempt first
+  if (res) // zero (error or no output) falls through to the scalar routine
+    return result(error_code::SUCCESS, res);
+  return scalar::utf8_to_latin1::convert_with_errors(src, len, dst);
+}
 
-      size_t ascii_count = count_ones(ascii_bitmask);
-      size_t two_bytes_count = count_ones(two_bytes_bitmask);
-      size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
-      size_t three_bytes_count =
-          32 - ascii_count - two_bytes_count - surrogate_bytes_count;
-      count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count +
-               2 * surrogate_bytes_count;
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+    const char *src, size_t len, char *dst) const noexcept {
+  const char *beg = dst; // output start, used for the returned count
+  uint8_t last = 0; // last input byte of the previous chunk (0 at start)
+  for (size_t vl, vlOut; len > 0;
+       len -= vl, src += vl, dst += vlOut, last = src[-1]) {
+    vl = __riscv_vsetvl_e8m2(len);
+    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
+    vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl);
+    vlOut = __riscv_vcpop_m_b4(ascii, vl); // number of ASCII bytes
+    if (vlOut != vl) { // If not pure ASCII
+      vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
+      vlOut = __riscv_vcpop_m_b4(m, vl); // bytes below 0xC0 are kept
+      vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl); // v0[i] = v1[i-1]
+      v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl),
+                                  v1, v1, 0b01000000, vl); // after 0xC3: set bit 6
+      v1 = __riscv_vcompress_vm_u8m2(v1, m, vl); // pack the bytes kept by m
     }
+    __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut); // store vlOut bytes
   }
-
-  return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>(
-                     ptr, length - (ptr - input));
+  return dst - beg; // number of bytes written
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
-    const char16_t *input, size_t length) const noexcept {
-  return implementation::count_utf16le(input, length);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+    const char *src, size_t len, char16_t *dst) const noexcept {
+  // Shared UTF-8 routine with 16-bit output units and no byte flip.
+  return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::NONE>(src, len,
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
-    const char16_t *input, size_t length) const noexcept {
-  return implementation::count_utf16be(input, length);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+    const char *src, size_t len, char16_t *dst) const noexcept {
+  if (supports_zvbb()) // pick the ZVBB byte-flip instantiation
+    return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::ZVBB>(
+        src, len, (uint16_t *)dst);
+  else // otherwise the ByteFlip::V instantiation
+    return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::V>(src, len,
+                                                             (uint16_t *)dst);
 }
 
-simdutf_warn_unused size_t
-implementation::utf16_length_from_latin1(size_t length) const noexcept {
-  return scalar::latin1::utf16_length_from_latin1(length);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+    const char *src, size_t len, char16_t *dst) const noexcept {
+  size_t res = convert_utf8_to_utf16le(src, len, dst); // vector attempt first
+  if (res) // a zero result falls through to the scalar routine below
+    return result(error_code::SUCCESS, res);
+  return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(
+      src, len, dst);
 }
 
-simdutf_warn_unused size_t
-implementation::utf32_length_from_latin1(size_t length) const noexcept {
-  return scalar::latin1::utf32_length_from_latin1(length);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+    const char *src, size_t len, char16_t *dst) const noexcept {
+  size_t res = convert_utf8_to_utf16be(src, len, dst); // vector attempt first
+  if (res) // a zero result falls through to the scalar routine below
+    return result(error_code::SUCCESS, res);
+  return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(src, len,
+                                                                     dst);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
-    const char *input, size_t length) const noexcept {
-  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
-  size_t answer = length / sizeof(__m512i) * sizeof(__m512i);
-  size_t i = 0;
-  if (answer >= 2048) { // long strings optimization
-    unsigned char v_0xFF = 0xff;
-    __m512i eight_64bits = _mm512_setzero_si512();
-    while (i + sizeof(__m512i) <= length) {
-      __m512i runner = _mm512_setzero_si512();
-      size_t iterations = (length - i) / sizeof(__m512i);
-      if (iterations > 255) {
-        iterations = 255;
-      }
-      size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i);
-      for (; i + 4 * sizeof(__m512i) <= max_i; i += 4 * sizeof(__m512i)) {
-        // Load four __m512i vectors
-        __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i));
-        __m512i input2 =
-            _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i)));
-        __m512i input3 = _mm512_loadu_si512(
-            (const __m512i *)(str + i + 2 * sizeof(__m512i)));
-        __m512i input4 = _mm512_loadu_si512(
-            (const __m512i *)(str + i + 3 * sizeof(__m512i)));
-
-        // Generate four masks
-        __mmask64 mask1 =
-            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1);
-        __mmask64 mask2 =
-            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2);
-        __mmask64 mask3 =
-            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3);
-        __mmask64 mask4 =
-            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4);
-        // Apply the masks and subtract from the runner
-        __m512i not_ascii1 =
-            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF);
-        __m512i not_ascii2 =
-            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF);
-        __m512i not_ascii3 =
-            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF);
-        __m512i not_ascii4 =
-            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF);
-
-        runner = _mm512_sub_epi8(runner, not_ascii1);
-        runner = _mm512_sub_epi8(runner, not_ascii2);
-        runner = _mm512_sub_epi8(runner, not_ascii3);
-        runner = _mm512_sub_epi8(runner, not_ascii4);
-      }
-
-      for (; i <= max_i; i += sizeof(__m512i)) {
-        __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i));
-
-        __mmask64 mask =
-            _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input);
-        __m512i not_ascii =
-            _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF);
-        runner = _mm512_sub_epi8(runner, not_ascii);
-      }
-
-      eight_64bits = _mm512_add_epi64(
-          eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512()));
-    }
-
-    __m256i first_half = _mm512_extracti64x4_epi64(eight_64bits, 0);
-    __m256i second_half = _mm512_extracti64x4_epi64(eight_64bits, 1);
-    answer += (size_t)_mm256_extract_epi64(first_half, 0) +
-              (size_t)_mm256_extract_epi64(first_half, 1) +
-              (size_t)_mm256_extract_epi64(first_half, 2) +
-              (size_t)_mm256_extract_epi64(first_half, 3) +
-              (size_t)_mm256_extract_epi64(second_half, 0) +
-              (size_t)_mm256_extract_epi64(second_half, 1) +
-              (size_t)_mm256_extract_epi64(second_half, 2) +
-              (size_t)_mm256_extract_epi64(second_half, 3);
-  } else if (answer > 0) {
-    for (; i + sizeof(__m512i) <= length; i += sizeof(__m512i)) {
-      __m512i latin = _mm512_loadu_si512((const __m512i *)(str + i));
-      uint64_t non_ascii = _mm512_movepi8_mask(latin);
-      answer += count_ones(non_ascii);
-    }
-  }
-  return answer + scalar::latin1::utf8_length_from_latin1(
-                      reinterpret_cast<const char *>(str + i), length - i);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+    const char *src, size_t len, char16_t *dst) const noexcept {
+  return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::NONE, false>(
+      src, len, (uint16_t *)dst); // NOTE(review): false presumably = no validation
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
-    const char *input, size_t length) const noexcept {
-  size_t pos = 0;
-  size_t count = 0;
-  // This algorithm could no doubt be improved!
-  for (; pos + 64 <= length; pos += 64) {
-    __m512i utf8 = _mm512_loadu_si512((const __m512i *)(input + pos));
-    uint64_t utf8_continuation_mask =
-        _mm512_cmplt_epi8_mask(utf8, _mm512_set1_epi8(-65 + 1));
-    // We count one word for anything that is not a continuation (so
-    // leading bytes).
-    count += 64 - count_ones(utf8_continuation_mask);
-    uint64_t utf8_4byte =
-        _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240)));
-    count += count_ones(utf8_4byte);
-  }
-  return count +
-         scalar::utf8::utf16_length_from_utf8(input + pos, length - pos);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+    const char *src, size_t len, char16_t *dst) const noexcept {
+  if (supports_zvbb()) // pick the ZVBB byte-flip instantiation
+    return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::ZVBB, false>(
+        src, len, (uint16_t *)dst);
+  else // NOTE(review): trailing false presumably = no validation; confirm
+    return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::V, false>(
+        src, len, (uint16_t *)dst);
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
-    const char32_t *input, size_t length) const noexcept {
-  const char32_t *ptr = input;
-  size_t count{0};
-
-  if (length >= 16) {
-    const char32_t *end = input + length - 16;
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+    const char *src, size_t len, char32_t *dst) const noexcept {
+  // Shared UTF-8 routine with 32-bit output units and no byte flip.
+  return rvv_utf8_to_common<uint32_t, simdutf_ByteFlip::NONE>(src, len,
+}
 
-    const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f);
-    const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff);
-    const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+    const char *src, size_t len, char32_t *dst) const noexcept {
+  size_t res = convert_utf8_to_utf32(src, len, dst); // vector attempt first
+  if (res) // a zero result falls through to the scalar routine below
+    return result(error_code::SUCCESS, res);
+  return scalar::utf8_to_utf32::convert_with_errors(src, len, dst);
+}
 
-    while (ptr <= end) {
-      __m512i utf32 = _mm512_loadu_si512((const __m512i *)ptr);
-      ptr += 16;
-      __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f);
-      __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(
-          _knot_mask16(ascii_bitmask), utf32, v_0000_07ff);
-      __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(
-          _knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32,
-          v_0000_ffff);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+    const char *src, size_t len, char32_t *dst) const noexcept {
+  return rvv_utf8_to_common<uint32_t, simdutf_ByteFlip::NONE, false>(
+      src, len, (uint32_t *)dst); // NOTE(review): false presumably = no validation
+}
+/* end file src/rvv/rvv_utf8_to.inl.cpp */
 
-      size_t ascii_count = count_ones(ascii_bitmask);
-      size_t two_bytes_count = count_ones(two_bytes_bitmask);
-      size_t three_bytes_count = count_ones(three_bytes_bitmask);
-      size_t four_bytes_count =
-          16 - ascii_count - two_bytes_count - three_bytes_count;
-      count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count +
-               4 * four_bytes_count;
-    }
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+                                 size_t length) const noexcept {
+  // If there is a BOM, then we trust it.
+  auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  if (bom_encoding != encoding_type::unspecified)
+    return bom_encoding;
+  // TODO: reimplement as a one-pass algorithm.
+  int out = 0; // bitwise OR of every encoding_type the input validates as
+  if (validate_utf8(input, length))
+    out |= encoding_type::UTF8;
+  if (length % 2 == 0) { // only try UTF-16 on a whole number of 2-byte units
+    if (validate_utf16(reinterpret_cast<const char16_t *>(input), length / 2))
+      out |= encoding_type::UTF16_LE;
+  }
+  if (length % 4 == 0) { // only try UTF-32 on a whole number of 4-byte units
+    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4))
+      out |= encoding_type::UTF32_LE;
   }
 
-  return count +
-         scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input));
+  return out; // 0 when none of the three validations passed
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
-    const char32_t *input, size_t length) const noexcept {
-  const char32_t *ptr = input;
-  size_t count{0};
-
-  if (length >= 16) {
-    const char32_t *end = input + length - 16;
-
-    const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
-
-    while (ptr <= end) {
-      __m512i utf32 = _mm512_loadu_si512((const __m512i *)ptr);
-      ptr += 16;
-      __mmask16 surrogates_bitmask =
-          _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
-
-      count += 16 + count_ones(surrogates_bitmask);
-    }
+template <simdutf_ByteFlip bflip>
+simdutf_really_inline static void
+rvv_change_endianness_utf16(const char16_t *src, size_t len, char16_t *dst) {
+  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
+    vl = __riscv_vsetvl_e16m8(len); // 16-bit elements handled this pass
+    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
+    __riscv_vse16_v_u16m8((uint16_t *)dst, simdutf_byteflip<bflip>(v, vl), vl);
   }
-
-  return count +
-         scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input));
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
-    const char *input, size_t length) const noexcept {
-  return implementation::count_utf8(input, length);
+void implementation::change_endianness_utf16(const char16_t *src, size_t len,
+                                             char16_t *dst) const noexcept {
+  if (supports_zvbb()) // pick the ZVBB byte-flip instantiation
+    return rvv_change_endianness_utf16<simdutf_ByteFlip::ZVBB>(src, len, dst);
+  else // otherwise the ByteFlip::V instantiation
+    return rvv_change_endianness_utf16<simdutf_ByteFlip::V>(src, len, dst);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
@@ -25315,21 +37828,86 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
 simdutf_warn_unused result implementation::base64_to_binary(
     const char *input, size_t length, char *output, base64_options options,
     last_chunk_handling_options last_chunk_options) const noexcept {
-  return (options & base64_url)
-             ? compress_decode_base64<true>(output, input, length, options,
-                                            last_chunk_options)
-             : compress_decode_base64<false>(output, input, length, options,
-                                             last_chunk_options);
+  while (length > 0 &&
+         scalar::base64::is_ascii_white_space(input[length - 1])) {
+    length--; // strip trailing ASCII white space
+  }
+  size_t equallocation =
+      length; // location of the first padding character if any
+  size_t equalsigns = 0; // number of trailing '=' consumed (at most 2)
+  if (length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
+    length -= 1;
+    equalsigns++;
+    while (length > 0 &&
+           scalar::base64::is_ascii_white_space(input[length - 1])) {
+      length--; // white space ahead of the '=' just consumed
+    }
+    if (length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
+      equalsigns++;
+      length -= 1;
+    }
+  }
+  if (length == 0) { // nothing left to decode
+    if (equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation};
+    }
+    return {SUCCESS, 0};
+  }
+  result r = scalar::base64::base64_tail_decode(
+      output, input, length, equalsigns, options, last_chunk_options);
+  if (last_chunk_options != stop_before_partial &&
+      r.error == error_code::SUCCESS && equalsigns > 0) {
+    // padding must fit: count % 3 == 1 needs two '=', count % 3 == 2 needs one
+    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
+      return {INVALID_BASE64_CHARACTER, equallocation};
+    }
+  }
+  return r;
 }
 
 simdutf_warn_unused full_result implementation::base64_to_binary_details(
     const char *input, size_t length, char *output, base64_options options,
     last_chunk_handling_options last_chunk_options) const noexcept {
-  return (options & base64_url)
-             ? compress_decode_base64<true>(output, input, length, options,
-                                            last_chunk_options)
-             : compress_decode_base64<false>(output, input, length, options,
-                                             last_chunk_options);
+  while (length > 0 &&
+         scalar::base64::is_ascii_white_space(input[length - 1])) {
+    length--; // strip trailing ASCII white space
+  }
+  size_t equallocation =
+      length; // location of the first padding character if any
+  size_t equalsigns = 0; // number of trailing '=' consumed (at most 2)
+  if (length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
+    length -= 1;
+    equalsigns++;
+    while (length > 0 &&
+           scalar::base64::is_ascii_white_space(input[length - 1])) {
+      length--; // white space ahead of the '=' just consumed
+    }
+    if (length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
+      equalsigns++;
+      length -= 1;
+    }
+  }
+  if (length == 0) { // nothing left to decode
+    if (equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation, 0};
+    }
+    return {SUCCESS, 0, 0};
+  }
+  full_result r = scalar::base64::base64_tail_decode(
+      output, input, length, equalsigns, options, last_chunk_options);
+  if (last_chunk_options != stop_before_partial &&
+      r.error == error_code::SUCCESS && equalsigns > 0) {
+    // padding must fit: count % 3 == 1 needs two '=', count % 3 == 2 needs one
+    if ((r.output_count % 3 == 0) ||
+        ((r.output_count % 3) + 1 + equalsigns != 4)) {
+      return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
+    }
+  }
+  return r;
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
@@ -25340,21 +37918,86 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
 simdutf_warn_unused result implementation::base64_to_binary(
     const char16_t *input, size_t length, char *output, base64_options options,
     last_chunk_handling_options last_chunk_options) const noexcept {
-  return (options & base64_url)
-             ? compress_decode_base64<true>(output, input, length, options,
-                                            last_chunk_options)
-             : compress_decode_base64<false>(output, input, length, options,
-                                             last_chunk_options);
+  while (length > 0 &&
+         scalar::base64::is_ascii_white_space(input[length - 1])) {
+    length--;
+  }
+  size_t equallocation =
+      length; // location of the first padding character if any
+  auto equalsigns = 0;
+  if (length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
+    length -= 1;
+    equalsigns++;
+    while (length > 0 &&
+           scalar::base64::is_ascii_white_space(input[length - 1])) {
+      length--;
+    }
+    if (length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
+      equalsigns++;
+      length -= 1;
+    }
+  }
+  if (length == 0) {
+    if (equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation};
+    }
+    return {SUCCESS, 0};
+  }
+  result r = scalar::base64::base64_tail_decode(
+      output, input, length, equalsigns, options, last_chunk_options);
+  if (last_chunk_options != stop_before_partial &&
+      r.error == error_code::SUCCESS && equalsigns > 0) {
+    // additional checks
+    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
+      return {INVALID_BASE64_CHARACTER, equallocation};
+    }
+  }
+  return r;
 }
 
 simdutf_warn_unused full_result implementation::base64_to_binary_details(
     const char16_t *input, size_t length, char *output, base64_options options,
     last_chunk_handling_options last_chunk_options) const noexcept {
-  return (options & base64_url)
-             ? compress_decode_base64<true>(output, input, length, options,
-                                            last_chunk_options)
-             : compress_decode_base64<false>(output, input, length, options,
-                                             last_chunk_options);
+  while (length > 0 &&
+         scalar::base64::is_ascii_white_space(input[length - 1])) {
+    length--;
+  }
+  size_t equallocation =
+      length; // location of the first padding character if any
+  size_t equalsigns = 0;
+  if (length > 0 && input[length - 1] == '=') {
+    equallocation = length - 1;
+    length -= 1;
+    equalsigns++;
+    while (length > 0 &&
+           scalar::base64::is_ascii_white_space(input[length - 1])) {
+      length--;
+    }
+    if (length > 0 && input[length - 1] == '=') {
+      equallocation = length - 1;
+      equalsigns++;
+      length -= 1;
+    }
+  }
+  if (length == 0) {
+    if (equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation, 0};
+    }
+    return {SUCCESS, 0, 0};
+  }
+  full_result r = scalar::base64::base64_tail_decode(
+      output, input, length, equalsigns, options, last_chunk_options);
+  if (last_chunk_options != stop_before_partial &&
+      r.error == error_code::SUCCESS && equalsigns > 0) {
+    // additional checks
+    if ((r.output_count % 3 == 0) ||
+        ((r.output_count % 3) + 1 + equalsigns != 4)) {
+      return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
+    }
+  }
+  return r;
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(
@@ -25365,56 +38008,38 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary(
 size_t implementation::binary_to_base64(const char *input, size_t length,
                                         char *output,
                                         base64_options options) const noexcept {
-  if (options & base64_url) {
-    return encode_base64<true>(output, input, length, options);
-  } else {
-    return encode_base64<false>(output, input, length, options);
-  }
+  return scalar::base64::tail_encode_base64(output, input, length, options);
 }
-
-} // namespace icelake
+} // namespace rvv
 } // namespace simdutf
 
-/* begin file src/simdutf/icelake/end.h */
-#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE
+/* begin file src/simdutf/rvv/end.h */
+#if SIMDUTF_CAN_ALWAYS_RUN_RVV
 // nothing needed.
 #else
 SIMDUTF_UNTARGET_REGION
 #endif
 
-
-#if SIMDUTF_GCC11ORMORE // workaround for
-                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_POP_DISABLE_WARNINGS
-#endif // end of workaround
-/* end file src/simdutf/icelake/end.h */
-/* end file src/icelake/implementation.cpp */
+/* end file src/simdutf/rvv/end.h */
+/* end file src/rvv/implementation.cpp */
 #endif
-#if SIMDUTF_IMPLEMENTATION_HASWELL
-/* begin file src/haswell/implementation.cpp */
-
-/* begin file src/simdutf/haswell/begin.h */
-// redefining SIMDUTF_IMPLEMENTATION to "haswell"
-// #define SIMDUTF_IMPLEMENTATION haswell
+#if SIMDUTF_IMPLEMENTATION_WESTMERE
+/* begin file src/westmere/implementation.cpp */
+/* begin file src/simdutf/westmere/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "westmere"
+// #define SIMDUTF_IMPLEMENTATION westmere
 
-#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
+#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
 // nothing needed.
 #else
-SIMDUTF_TARGET_HASWELL
+SIMDUTF_TARGET_WESTMERE
 #endif
-
-#if SIMDUTF_GCC11ORMORE // workaround for
-                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-// clang-format off
-SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized)
-// clang-format on
-#endif // end of workaround
-/* end file src/simdutf/haswell/begin.h */
+/* end file src/simdutf/westmere/begin.h */
 namespace simdutf {
-namespace haswell {
+namespace westmere {
 namespace {
-#ifndef SIMDUTF_HASWELL_H
-  #error "haswell.h must be included"
+#ifndef SIMDUTF_WESTMERE_H
+  #error "westmere.h must be included"
 #endif
 using namespace simd;
 
@@ -25441,13 +38066,90 @@ simdutf_really_inline simd8<bool>
 must_be_2_3_continuation(const simd8<uint8_t> prev2,
                          const simd8<uint8_t> prev3) {
   simd8<uint8_t> is_third_byte =
-      prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be > 0x80
+      prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80
   simd8<uint8_t> is_fourth_byte =
-      prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be > 0x80
+      prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80
   return simd8<bool>(is_third_byte | is_fourth_byte);
 }
 
-/* begin file src/haswell/avx2_validate_utf16.cpp */
+/* begin file src/westmere/internal/loader.cpp */
+namespace internal {
+namespace westmere {
+
+/* begin file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */
+/*
+ * reads a vector of uint16 values
+ * bits after 11th are ignored
+ * first 11 bits are encoded into utf8
+ * !important! utf8_output must have at least 16 writable bytes
+ */
+
+inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output,
+                                       const __m128i one_byte_bytemask,
+                                       const uint16_t one_byte_bitmask) {
+  // 0b1100_0000_1000_0000
+  const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
+  // 0b0001_1111_0000_0000
+  const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+  // 0b0000_0000_0011_1111
+  const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
+
+  // 1. prepare 2-byte values
+  // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+  // expected output   : [110a|aaaa|10bb|bbbb] x 8
+
+  // t0 = [000a|aaaa|bbbb|bb00]
+  const __m128i t0 = _mm_slli_epi16(v_u16, 2);
+  // t1 = [000a|aaaa|0000|0000]
+  const __m128i t1 = _mm_and_si128(t0, v_1f00);
+  // t2 = [0000|0000|00bb|bbbb]
+  const __m128i t2 = _mm_and_si128(v_u16, v_003f);
+  // t3 = [000a|aaaa|00bb|bbbb]
+  const __m128i t3 = _mm_or_si128(t1, t2);
+  // t4 = [110a|aaaa|10bb|bbbb]
+  const __m128i t4 = _mm_or_si128(t3, v_c080);
+
+  // 2. merge ASCII and 2-byte codewords
+  const __m128i utf8_unpacked = _mm_blendv_epi8(t4, v_u16, one_byte_bytemask);
+
+  // 3. prepare bitmask for 8-bit lookup
+  //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a
+  //    - LSB)
+  const uint16_t m0 = one_byte_bitmask & 0x5555;      // m0 = 0h0g0f0e0d0c0b0a
+  const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+  const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
+  // 4. pack the bytes
+  const uint8_t *row =
+      &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
+  const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
+  const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+  // 5. store bytes
+  _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+
+  // 6. adjust pointers
+  utf8_output += row[0];
+}
+
+inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output,
+                                       const __m128i v_0000,
+                                       const __m128i v_ff80) {
+  // no bits set above 7th bit
+  const __m128i one_byte_bytemask =
+      _mm_cmpeq_epi16(_mm_and_si128(v_u16, v_ff80), v_0000);
+  const uint16_t one_byte_bitmask =
+      static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+
+  write_v_u16_11bits_to_utf8(v_u16, utf8_output, one_byte_bytemask,
+                             one_byte_bitmask);
+}
+/* end file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */
+
+} // namespace westmere
+} // namespace internal
+/* end file src/westmere/internal/loader.cpp */
+
+/* begin file src/westmere/sse_validate_utf16.cpp */
 /*
     In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning.
 
@@ -25468,7 +38170,7 @@ must_be_2_3_continuation(const simd8<uint8_t> prev2,
     - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
     - there must not be sole low surrogate nor high surrogate
 
-    We're going to build three bitmasks based on the 3rd nibble:
+    We are going to build three bitmasks based on the 3rd nibble:
     - V = valid word,
     - L = low surrogate (0xd800 .. 0xdbff)
     - H = high surrogate (0xdc00 .. 0xdfff)
@@ -25495,7 +38197,7 @@ must_be_2_3_continuation(const simd8<uint8_t> prev2,
    - nullptr if an error was detected.
 */
 template <endianness big_endian>
-const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) {
+const char16_t *sse_validate_utf16(const char16_t *input, size_t size) {
   const char16_t *end = input + size;
 
   const auto v_d8 = simd8<uint8_t>::splat(0xd8);
@@ -25503,13 +38205,13 @@ const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) {
   const auto v_fc = simd8<uint8_t>::splat(0xfc);
   const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
-  while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
+  while (input + simd16<uint16_t>::SIZE * 2 < end) {
     // 0. Load data: since the validation takes into account only higher
     //    byte of each word, we compress the two vectors into one which
     //    consists only the higher bytes.
     auto in0 = simd16<uint16_t>(input);
-    auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
-
+    auto in1 =
+        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
     if (big_endian) {
       in0 = in0.swap_bytes();
       in1 = in1.swap_bytes();
@@ -25522,9 +38224,10 @@ const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) {
 
     // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
     const auto surrogates_wordmask = (in & v_f8) == v_d8;
-    const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
-    if (surrogates_bitmask == 0x0) {
-      input += simd16<uint16_t>::ELEMENTS * 2;
+    const uint16_t surrogates_bitmask =
+        static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
+    if (surrogates_bitmask == 0x0000) {
+      input += 16;
     } else {
       // 2. We have some surrogates that have to be distinguished:
       //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
@@ -25534,35 +38237,36 @@ const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) {
 
       // V - non-surrogate code units
       //     V = not surrogates_wordmask
-      const uint32_t V = ~surrogates_bitmask;
+      const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
 
       // H - word-mask for high surrogates: the six highest bits are 0b1101'11
       const auto vH = (in & v_fc) == v_dc;
-      const uint32_t H = vH.to_bitmask();
+      const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
 
       // L - word mask for low surrogates
       //     L = not H and surrogates_wordmask
-      const uint32_t L = ~H & surrogates_bitmask;
+      const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
 
-      const uint32_t a =
-          L & (H >> 1); // A low surrogate must be followed by high one.
-                        // (A low surrogate placed in the 7th register's word
-                        // is an exception we handle.)
-      const uint32_t b =
-          a << 1; // Just mark that the opposite fact is hold,
-                  // thanks to that we have only two masks for valid case.
-      const uint32_t c = V | a | b; // Combine all the masks into the final one.
+      const uint16_t a = static_cast<uint16_t>(
+          L & (H >> 1)); // A low surrogate must be followed by high one.
+                         // (A low surrogate placed in the 7th register's word
+                         // is an exception we handle.)
+      const uint16_t b = static_cast<uint16_t>(
+          a << 1); // Just mark that the opposite fact holds,
+                   // thanks to that we have only two masks for valid case.
+      const uint16_t c = static_cast<uint16_t>(
+          V | a | b); // Combine all the masks into the final one.
 
-      if (c == 0xffffffff) {
+      if (c == 0xffff) {
         // The whole input register contains valid UTF-16, i.e.,
         // either single code units or proper surrogate pairs.
-        input += simd16<uint16_t>::ELEMENTS * 2;
-      } else if (c == 0x7fffffff) {
-        // The 31 lower code units of the input register contains valid UTF-16.
-        // The 31 word may be either a low or high surrogate. It the next
+        input += 16;
+      } else if (c == 0x7fff) {
+        // The 15 lower code units of the input register contain valid UTF-16.
+        // The 15th word may be either a low or high surrogate. In the next
         // iteration we 1) check if the low surrogate is followed by a high
         // one, 2) reject sole high surrogate.
-        input += simd16<uint16_t>::ELEMENTS * 2 - 1;
+        input += 15;
       } else {
         return nullptr;
       }
@@ -25573,8 +38277,8 @@ const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) {
 }
 
 template <endianness big_endian>
-const result avx2_validate_utf16_with_errors(const char16_t *input,
-                                             size_t size) {
+const result sse_validate_utf16_with_errors(const char16_t *input,
+                                            size_t size) {
   if (simdutf_unlikely(size == 0)) {
     return result(error_code::SUCCESS, 0);
   }
@@ -25586,12 +38290,13 @@ const result avx2_validate_utf16_with_errors(const char16_t *input,
   const auto v_fc = simd8<uint8_t>::splat(0xfc);
   const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
-  while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
+  while (input + simd16<uint16_t>::SIZE * 2 < end) {
     // 0. Load data: since the validation takes into account only higher
     //    byte of each word, we compress the two vectors into one which
     //    consists only the higher bytes.
     auto in0 = simd16<uint16_t>(input);
-    auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
+    auto in1 =
+        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
 
     if (big_endian) {
       in0 = in0.swap_bytes();
@@ -25605,9 +38310,10 @@ const result avx2_validate_utf16_with_errors(const char16_t *input,
 
     // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
     const auto surrogates_wordmask = (in & v_f8) == v_d8;
-    const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
-    if (surrogates_bitmask == 0x0) {
-      input += simd16<uint16_t>::ELEMENTS * 2;
+    const uint16_t surrogates_bitmask =
+        static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
+    if (surrogates_bitmask == 0x0000) {
+      input += 16;
     } else {
       // 2. We have some surrogates that have to be distinguished:
       //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
@@ -25617,35 +38323,36 @@ const result avx2_validate_utf16_with_errors(const char16_t *input,
 
       // V - non-surrogate code units
       //     V = not surrogates_wordmask
-      const uint32_t V = ~surrogates_bitmask;
+      const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
 
       // H - word-mask for high surrogates: the six highest bits are 0b1101'11
       const auto vH = (in & v_fc) == v_dc;
-      const uint32_t H = vH.to_bitmask();
+      const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
 
       // L - word mask for low surrogates
       //     L = not H and surrogates_wordmask
-      const uint32_t L = ~H & surrogates_bitmask;
+      const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
 
-      const uint32_t a =
-          L & (H >> 1); // A low surrogate must be followed by high one.
-                        // (A low surrogate placed in the 7th register's word
-                        // is an exception we handle.)
-      const uint32_t b =
-          a << 1; // Just mark that the opposite fact is hold,
-                  // thanks to that we have only two masks for valid case.
-      const uint32_t c = V | a | b; // Combine all the masks into the final one.
+      const uint16_t a = static_cast<uint16_t>(
+          L & (H >> 1)); // A low surrogate must be followed by high one.
+                         // (A low surrogate placed in the 7th register's word
+                         // is an exception we handle.)
+      const uint16_t b = static_cast<uint16_t>(
+          a << 1); // Just mark that the opposite fact holds,
+                   // thanks to that we have only two masks for valid case.
+      const uint16_t c = static_cast<uint16_t>(
+          V | a | b); // Combine all the masks into the final one.
 
-      if (c == 0xffffffff) {
+      if (c == 0xffff) {
         // The whole input register contains valid UTF-16, i.e.,
         // either single code units or proper surrogate pairs.
-        input += simd16<uint16_t>::ELEMENTS * 2;
-      } else if (c == 0x7fffffff) {
-        // The 31 lower code units of the input register contains valid UTF-16.
-        // The 31 word may be either a low or high surrogate. It the next
+        input += 16;
+      } else if (c == 0x7fff) {
+        // The 15 lower code units of the input register contain valid UTF-16.
+        // The 15th word may be either a low or high surrogate. In the next
         // iteration we 1) check if the low surrogate is followed by a high
         // one, 2) reject sole high surrogate.
-        input += simd16<uint16_t>::ELEMENTS * 2 - 1;
+        input += 15;
       } else {
         return result(error_code::SURROGATE, input - start);
       }
@@ -25654,228 +38361,210 @@ const result avx2_validate_utf16_with_errors(const char16_t *input,
 
   return result(error_code::SUCCESS, input - start);
 }
-/* end file src/haswell/avx2_validate_utf16.cpp */
-/* begin file src/haswell/avx2_validate_utf32le.cpp */
+/* end file src/westmere/sse_validate_utf16.cpp */
+/* begin file src/westmere/sse_validate_utf32le.cpp */
 /* Returns:
    - pointer to the last unprocessed character (a scalar fallback should check
    the rest);
    - nullptr if an error was detected.
 */
-const char32_t *avx2_validate_utf32le(const char32_t *input, size_t size) {
+const char32_t *sse_validate_utf32le(const char32_t *input, size_t size) {
   const char32_t *end = input + size;
 
-  const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
-  const __m256i offset = _mm256_set1_epi32(0xffff2000);
-  const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
-  __m256i currentmax = _mm256_setzero_si256();
-  __m256i currentoffsetmax = _mm256_setzero_si256();
+  const __m128i standardmax = _mm_set1_epi32(0x10ffff);
+  const __m128i offset = _mm_set1_epi32(0xffff2000);
+  const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
+  __m128i currentmax = _mm_setzero_si128();
+  __m128i currentoffsetmax = _mm_setzero_si128();
 
-  while (input + 8 < end) {
-    const __m256i in = _mm256_loadu_si256((__m256i *)input);
-    currentmax = _mm256_max_epu32(in, currentmax);
+  while (input + 4 < end) {
+    const __m128i in = _mm_loadu_si128((__m128i *)input);
+    currentmax = _mm_max_epu32(in, currentmax);
     currentoffsetmax =
-        _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
-    input += 8;
+        _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
+    input += 4;
   }
-  __m256i is_zero =
-      _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax);
-  if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+  __m128i is_zero =
+      _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
+  if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
     return nullptr;
   }
 
-  is_zero = _mm256_xor_si256(
-      _mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax);
-  if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+  is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax),
+                          standardoffsetmax);
+  if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
     return nullptr;
   }
 
   return input;
 }
 
-const result avx2_validate_utf32le_with_errors(const char32_t *input,
-                                               size_t size) {
+const result sse_validate_utf32le_with_errors(const char32_t *input,
+                                              size_t size) {
   const char32_t *start = input;
   const char32_t *end = input + size;
 
-  const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
-  const __m256i offset = _mm256_set1_epi32(0xffff2000);
-  const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff);
-  __m256i currentmax = _mm256_setzero_si256();
-  __m256i currentoffsetmax = _mm256_setzero_si256();
+  const __m128i standardmax = _mm_set1_epi32(0x10ffff);
+  const __m128i offset = _mm_set1_epi32(0xffff2000);
+  const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
+  __m128i currentmax = _mm_setzero_si128();
+  __m128i currentoffsetmax = _mm_setzero_si128();
 
-  while (input + 8 < end) {
-    const __m256i in = _mm256_loadu_si256((__m256i *)input);
-    currentmax = _mm256_max_epu32(in, currentmax);
+  while (input + 4 < end) {
+    const __m128i in = _mm_loadu_si128((__m128i *)input);
+    currentmax = _mm_max_epu32(in, currentmax);
     currentoffsetmax =
-        _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax);
+        _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
 
-    __m256i is_zero = _mm256_xor_si256(
-        _mm256_max_epu32(currentmax, standardmax), standardmax);
-    if (_mm256_testz_si256(is_zero, is_zero) == 0) {
+    __m128i is_zero =
+        _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
+    if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
       return result(error_code::TOO_LARGE, input - start);
     }
 
-    is_zero =
-        _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax),
-                         standardoffsetmax);
-    if (_mm256_testz_si256(is_zero, is_zero) == 0) {
-      return result(error_code::SURROGATE, input - start);
-    }
-    input += 8;
-  }
-
-  return result(error_code::SUCCESS, input - start);
-}
-/* end file src/haswell/avx2_validate_utf32le.cpp */
-
-/* begin file src/haswell/avx2_convert_latin1_to_utf8.cpp */
-std::pair<const char *, char *>
-avx2_convert_latin1_to_utf8(const char *latin1_input, size_t len,
-                            char *utf8_output) {
-  const char *end = latin1_input + len;
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
-  const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
-  const size_t safety_margin = 12;
-
-  while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) {
-    __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input);
-    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-    const __m128i v_80 = _mm_set1_epi8((char)0x80);
-    if (_mm_testz_si128(in8, v_80)) { // ASCII fast path!!!!
-      // 1. store (16 bytes)
-      _mm_storeu_si128((__m128i *)utf8_output, in8);
-      // 2. adjust pointers
-      latin1_input += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
-    }
-    // We proceed only with the first 16 bytes.
-    const __m256i in = _mm256_cvtepu8_epi16((in8));
-
-    // 1. prepare 2-byte values
-    // input 16-bit word : [0000|0000|aabb|bbbb] x 8
-    // expected output   : [1100|00aa|10bb|bbbb] x 8
-    const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-    const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-    // t0 = [0000|00aa|bbbb|bb00]
-    const __m256i t0 = _mm256_slli_epi16(in, 2);
-    // t1 = [0000|00aa|0000|0000]
-    const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-    // t2 = [0000|0000|00bb|bbbb]
-    const __m256i t2 = _mm256_and_si256(in, v_003f);
-    // t3 = [000a|aaaa|00bb|bbbb]
-    const __m256i t3 = _mm256_or_si256(t1, t2);
-    // t4 = [1100|00aa|10bb|bbbb]
-    const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-    // 2. merge ASCII and 2-byte codewords
-
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask);
-
-    // 3. prepare bitmask for 8-bit lookup
-    const uint32_t M0 = one_byte_bitmask & 0x55555555;
-    const uint32_t M1 = M0 >> 7;
-    const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
-    // 4. pack the bytes
-
-    const uint8_t *row =
-        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-    const uint8_t *row_2 =
-        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)]
-                                                            [0];
-
-    const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
-    const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
-
-    const __m256i utf8_packed = _mm256_shuffle_epi8(
-        utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
-    // 5. store bytes
-    _mm_storeu_si128((__m128i *)utf8_output,
-                     _mm256_castsi256_si128(utf8_packed));
-    utf8_output += row[0];
-    _mm_storeu_si128((__m128i *)utf8_output,
-                     _mm256_extractf128_si256(utf8_packed, 1));
-    utf8_output += row_2[0];
-
-    // 6. adjust pointers
-    latin1_input += 16;
-    continue;
-
-  } // while
-  return std::make_pair(latin1_input, utf8_output);
+    is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax),
+                            standardoffsetmax);
+    if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
+      return result(error_code::SURROGATE, input - start);
+    }
+    input += 4;
+  }
+
+  return result(error_code::SUCCESS, input - start);
 }
-/* end file src/haswell/avx2_convert_latin1_to_utf8.cpp */
-/* begin file src/haswell/avx2_convert_latin1_to_utf16.cpp */
-template <endianness big_endian>
-std::pair<const char *, char16_t *>
-avx2_convert_latin1_to_utf16(const char *latin1_input, size_t len,
-                             char16_t *utf16_output) {
-  size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 32
+/* end file src/westmere/sse_validate_utf32le.cpp */
 
-  size_t i = 0;
-  for (; i < rounded_len; i += 16) {
-    // Load 16 bytes from the address (input + i) into a xmm register
-    __m128i xmm0 =
-        _mm_loadu_si128(reinterpret_cast<const __m128i *>(latin1_input + i));
+/* begin file src/westmere/sse_convert_latin1_to_utf8.cpp */
+std::pair<const char *const, char *const>
+sse_convert_latin1_to_utf8(const char *latin_input,
+                           const size_t latin_input_length, char *utf8_output) {
+  const char *end = latin_input + latin_input_length;
 
-    // Zero extend each byte in xmm0 to word and put it in another xmm register
-    __m128i xmm1 = _mm_cvtepu8_epi16(xmm0);
+  const __m128i v_0000 = _mm_setzero_si128();
+  // 0b1000_0000
+  const __m128i v_80 = _mm_set1_epi8((uint8_t)0x80);
+  // 0b1111_1111_1000_0000
+  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
 
-    // Shift xmm0 to the right by 8 bytes
-    xmm0 = _mm_srli_si128(xmm0, 8);
+  const __m128i latin_1_half_into_u16_byte_mask =
+      _mm_setr_epi8(0, '\x80', 1, '\x80', 2, '\x80', 3, '\x80', 4, '\x80', 5,
+                    '\x80', 6, '\x80', 7, '\x80');
 
-    // Zero extend each byte in the shifted xmm0 to word in xmm0
-    xmm0 = _mm_cvtepu8_epi16(xmm0);
+  const __m128i latin_2_half_into_u16_byte_mask =
+      _mm_setr_epi8(8, '\x80', 9, '\x80', 10, '\x80', 11, '\x80', 12, '\x80',
+                    13, '\x80', 14, '\x80', 15, '\x80');
 
-    if (big_endian) {
-      const __m128i swap =
-          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      xmm0 = _mm_shuffle_epi8(xmm0, swap);
-      xmm1 = _mm_shuffle_epi8(xmm1, swap);
+  // Each Latin-1 character takes 1-2 UTF-8 bytes.
+  // The slow path writes 8-15 useful bytes twice (it eagerly writes 16 bytes
+  // and then adjusts the pointer), so the last write can exceed the
+  // utf8_output size by 1-8 bytes. By reserving 8 extra input bytes, we expect
+  // the output to have 8-16 bytes free.
+  while (end - latin_input >= 16 + 8) {
+    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
+    __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input);
+
+    if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!!
+      _mm_storeu_si128((__m128i *)utf8_output, v_latin);
+      latin_input += 16;
+      utf8_output += 16;
+      continue;
     }
 
-    // Store the contents of xmm1 into the address pointed by (output + i)
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i), xmm1);
+    // assuming a/b are bytes and A/B are uint16 of the same value
+    // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
+    __m128i v_u16_latin_1_half =
+        _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
+    // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB
+    __m128i v_u16_latin_2_half =
+        _mm_shuffle_epi8(v_latin, latin_2_half_into_u16_byte_mask);
 
-    // Store the contents of xmm0 into the address pointed by (output + i + 8)
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i + 8), xmm0);
+    internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half,
+                                                   utf8_output, v_0000, v_ff80);
+    internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_2_half,
+                                                   utf8_output, v_0000, v_ff80);
+    latin_input += 16;
+  }
+
+  if (end - latin_input >= 16) {
+    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
+    __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input);
+
+    if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!!
+      _mm_storeu_si128((__m128i *)utf8_output, v_latin);
+      latin_input += 16;
+      utf8_output += 16;
+    } else {
+      // assuming a/b are bytes and A/B are uint16 of the same value
+      // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
+      __m128i v_u16_latin_1_half =
+          _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
+      internal::westmere::write_v_u16_11bits_to_utf8(
+          v_u16_latin_1_half, utf8_output, v_0000, v_ff80);
+      latin_input += 8;
+    }
   }
 
+  return std::make_pair(latin_input, utf8_output);
+}
+/* end file src/westmere/sse_convert_latin1_to_utf8.cpp */
+/* begin file src/westmere/sse_convert_latin1_to_utf16.cpp */
+template <endianness big_endian>
+std::pair<const char *, char16_t *>
+sse_convert_latin1_to_utf16(const char *latin1_input, size_t len,
+                            char16_t *utf16_output) {
+  size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
+  for (size_t i = 0; i < rounded_len; i += 16) {
+    // Load 16 Latin1 characters into a 128-bit register
+    __m128i in =
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(&latin1_input[i]));
+    __m128i out1 = big_endian ? _mm_unpacklo_epi8(_mm_setzero_si128(), in)
+                              : _mm_unpacklo_epi8(in, _mm_setzero_si128());
+    __m128i out2 = big_endian ? _mm_unpackhi_epi8(_mm_setzero_si128(), in)
+                              : _mm_unpackhi_epi8(in, _mm_setzero_si128());
+    // Zero extend each Latin1 character to 16-bit integers and store the
+    // results back to memory
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(&utf16_output[i]), out1);
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(&utf16_output[i + 8]), out2);
+  }
+  // return pointers pointing to where we left off
   return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len);
 }
-/* end file src/haswell/avx2_convert_latin1_to_utf16.cpp */
-/* begin file src/haswell/avx2_convert_latin1_to_utf32.cpp */
+/* end file src/westmere/sse_convert_latin1_to_utf16.cpp */
+/* begin file src/westmere/sse_convert_latin1_to_utf32.cpp */
 std::pair<const char *, char32_t *>
-avx2_convert_latin1_to_utf32(const char *buf, size_t len,
-                             char32_t *utf32_output) {
-  size_t rounded_len = ((len | 7) ^ 7); // Round down to nearest multiple of 8
+sse_convert_latin1_to_utf32(const char *buf, size_t len,
+                            char32_t *utf32_output) {
+  const char *end = buf + len;
 
-  for (size_t i = 0; i < rounded_len; i += 8) {
-    // Load 8 Latin1 characters into a 64-bit register
-    __m128i in = _mm_loadl_epi64((__m128i *)&buf[i]);
+  while (end - buf >= 16) {
+    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
+    __m128i in = _mm_loadu_si128((__m128i *)buf);
 
-    // Zero extend each set of 8 Latin1 characters to 8 32-bit integers using
-    // vpmovzxbd
-    __m256i out = _mm256_cvtepu8_epi32(in);
+    // Shift input to process next 4 bytes
+    __m128i in_shifted1 = _mm_srli_si128(in, 4);
+    __m128i in_shifted2 = _mm_srli_si128(in, 8);
+    __m128i in_shifted3 = _mm_srli_si128(in, 12);
 
-    // Store the results back to memory
-    _mm256_storeu_si256((__m256i *)&utf32_output[i], out);
+    // expand 8-bit to 32-bit unit
+    __m128i out1 = _mm_cvtepu8_epi32(in);
+    __m128i out2 = _mm_cvtepu8_epi32(in_shifted1);
+    __m128i out3 = _mm_cvtepu8_epi32(in_shifted2);
+    __m128i out4 = _mm_cvtepu8_epi32(in_shifted3);
+
+    _mm_storeu_si128((__m128i *)utf32_output, out1);
+    _mm_storeu_si128((__m128i *)(utf32_output + 4), out2);
+    _mm_storeu_si128((__m128i *)(utf32_output + 8), out3);
+    _mm_storeu_si128((__m128i *)(utf32_output + 12), out4);
+
+    utf32_output += 16;
+    buf += 16;
   }
 
-  // return pointers pointing to where we left off
-  return std::make_pair(buf + rounded_len, utf32_output + rounded_len);
+  return std::make_pair(buf, utf32_output);
 }
-/* end file src/haswell/avx2_convert_latin1_to_utf32.cpp */
+/* end file src/westmere/sse_convert_latin1_to_utf32.cpp */
 
-/* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */
+/* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
 // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
@@ -25904,18 +38593,20 @@ size_t convert_masked_utf8_to_utf16(const char *input,
       utf8_end_of_code_point_mask & 0xfff;
   if (utf8_end_of_code_point_mask == 0xfff) {
     // We process the data in chunks of 12 bytes.
-    __m256i ascii = _mm256_cvtepu8_epi16(in);
+    // Note: using 16 bytes is unsafe, see issue_ossfuzz_71218
+    __m128i ascii_first = _mm_cvtepu8_epi16(in);
+    __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in, 8));
     if (big_endian) {
-      const __m256i swap256 = _mm256_setr_epi8(
-          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
-          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      ascii = _mm256_shuffle_epi8(ascii, swap256);
+      ascii_first = _mm_shuffle_epi8(ascii_first, swap);
+      ascii_second = _mm_shuffle_epi8(ascii_second, swap);
     }
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii);
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first);
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8),
+                     ascii_second);
     utf16_output += 12; // We wrote 12 16-bit characters.
     return 12;          // We consumed 12 bytes.
   }
-  if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
+  if (((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
     // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte
     // UTF-16 code units. There is probably a more efficient sequence, but the
     // following might do.
@@ -25955,11 +38646,12 @@ size_t convert_masked_utf8_to_utf16(const char *input,
     utf16_output += 4;
     return 12;
   }
+  /// We do not have a fast path available, so we fall back.
 
-  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
-      [input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
-      [input_utf8_end_of_code_point_mask][1];
+  const uint8_t idx =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
   if (idx < 64) {
     // SIX (6) input code-code units
     // this is a relatively easy scenario
@@ -25967,8 +38659,8 @@ size_t convert_masked_utf8_to_utf16(const char *input,
     // code code units spanning between 1 and 2 bytes each is 12 bytes. On
     // processors where pdep/pext is fast, we might be able to use a small
     // lookup table.
-    const __m128i sh = _mm_loadu_si128(
-        (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
     const __m128i perm = _mm_shuffle_epi8(in, sh);
     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
@@ -25976,12 +38668,11 @@ size_t convert_masked_utf8_to_utf16(const char *input,
     if (big_endian)
       composed = _mm_shuffle_epi8(composed, swap);
     _mm_storeu_si128((__m128i *)utf16_output, composed);
-    utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential
-                       // overflow of 4 bytes.
+    utf16_output += 6; // We wrote 12 bytes, 6 code points.
   } else if (idx < 145) {
     // FOUR (4) input code-code units
-    const __m128i sh = _mm_loadu_si128(
-        (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
     const __m128i perm = _mm_shuffle_epi8(in, sh);
     const __m128i ascii =
         _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
@@ -25997,7 +38688,7 @@ size_t convert_masked_utf8_to_utf16(const char *input,
     if (big_endian)
       composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
     _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
-    utf16_output += 4; // Here we overflow by 8 bytes.
+    utf16_output += 4;
   } else if (idx < 209) {
     // TWO (2) input code-code units
     //////////////
@@ -26009,8 +38700,8 @@ size_t convert_masked_utf8_to_utf16(const char *input,
     // only leading bytes at least as large as 0xf0 generate surrogate pairs. We
     // do as at the cost of an extra mask.
     /////////////
-    const __m128i sh = _mm_loadu_si128(
-        (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]);
+    const __m128i sh =
+        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
     const __m128i perm = _mm_shuffle_epi8(in, sh);
     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
     const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
@@ -26071,8 +38762,8 @@ size_t convert_masked_utf8_to_utf16(const char *input,
   }
   return consumed;
 }
-/* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */
-/* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */
+/* end file src/westmere/sse_convert_utf8_to_utf16.cpp */
+/* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */
 // depends on "tables/utf8_to_utf16_tables.h"
 
 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
@@ -26098,10 +38789,14 @@ size_t convert_masked_utf8_to_utf32(const char *input,
       utf8_end_of_code_point_mask & 0xfff;
   if (utf8_end_of_code_point_mask == 0xfff) {
     // We process the data in chunks of 12 bytes.
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
-                        _mm256_cvtepu8_epi32(in));
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8),
-                        _mm256_cvtepu8_epi32(_mm_srli_si128(in, 8)));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
+                     _mm_cvtepu8_epi32(in));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
+                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 4)));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 8),
+                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 8)));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 12),
+                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 12)));
     utf32_output += 12; // We wrote 12 32-bit characters.
     return 12;          // We consumed 12 bytes.
   }
@@ -26115,9 +38810,11 @@ size_t convert_masked_utf8_to_utf32(const char *input,
     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
     const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    _mm256_storeu_si256((__m256i *)utf32_output,
-                        _mm256_cvtepu16_epi32(composed));
-    utf32_output += 8; // We wrote 16 bytes, 8 code points.
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
+                     _mm_cvtepu16_epi32(composed));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
+                     _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
+    utf32_output += 8; // We wrote 32 bytes, 8 code points.
     return 16;
   }
   if (input_utf8_end_of_code_point_mask == 0x924) {
@@ -26160,10 +38857,11 @@ size_t convert_masked_utf8_to_utf32(const char *input,
     const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
     const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
     const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    _mm256_storeu_si256((__m256i *)utf32_output,
-                        _mm256_cvtepu16_epi32(composed));
-    utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential
-    // overflow of 32 - 24 = 8 bytes.
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
+                     _mm_cvtepu16_epi32(composed));
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
+                     _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
+    utf32_output += 6; // We wrote 24 bytes, 6 code points.
   } else if (idx < 145) {
     // FOUR (4) input code-code units
     const __m128i sh =
@@ -26201,46 +38899,99 @@ size_t convert_masked_utf8_to_utf32(const char *input,
         _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
                      _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
     _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output +=
-        3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes.
+    utf32_output += 3;
   } else {
     // here we know that there is an error but we do not handle errors
   }
   return consumed;
 }
-/* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */
+/* end file src/westmere/sse_convert_utf8_to_utf32.cpp */
+/* begin file src/westmere/sse_convert_utf8_to_latin1.cpp */
+// depends on "tables/utf8_to_utf16_tables.h"
 
-/* begin file src/haswell/avx2_convert_utf16_to_latin1.cpp */
+// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
+// end of the code points. Only the least significant 12 bits of the mask
+// are accessed.
+// It returns how many bytes were consumed (up to 12).
+size_t convert_masked_utf8_to_latin1(const char *input,
+                                     uint64_t utf8_end_of_code_point_mask,
+                                     char *&latin1_output) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it
+  // is maybe beneficial to have fast paths that depend on branch prediction but
+  // have less latency. This results in more instructions but, potentially, also
+  // higher speeds.
+  //
+  const __m128i in = _mm_loadu_si128((__m128i *)input);
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask &
+      0xfff; // we are only processing 12 bytes in case it is not all ASCII
+  if (utf8_end_of_code_point_mask == 0xfff) {
+    // We process the data in chunks of 12 bytes.
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
+    latin1_output += 12; // We wrote 12 characters.
+    return 12;           // We consumed 12 bytes.
+  }
+  /// We do not have a fast path available, so we fall back.
+  const uint8_t idx =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed =
+      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+  // this indicates an invalid input:
+  if (idx >= 64) {
+    return consumed;
+  }
+  // Here we should have (idx < 64); if not, there is a bug in the validation
+  // or elsewhere. This is a relatively easy scenario: we process SIX (6)
+  // input code units. The maximum length in bytes of six code units, each
+  // spanning between 1 and 2 bytes, is 12 bytes. On processors where
+  // pdep/pext is fast, we might be able to use a small lookup
+  // table.
+  const __m128i sh =
+      _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
+  const __m128i perm = _mm_shuffle_epi8(in, sh);
+  const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
+  const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
+  __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
+  const __m128i latin1_packed = _mm_packus_epi16(composed, composed);
+  // writing 8 bytes even though we only care about the first 6 bytes.
+  // performance note: it would be faster to use _mm_storeu_si128, we should
+  // investigate.
+  _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
+  latin1_output += 6; // We wrote 6 bytes.
+  return consumed;
+}
+/* end file src/westmere/sse_convert_utf8_to_latin1.cpp */
+
+/* begin file src/westmere/sse_convert_utf16_to_latin1.cpp */
 template <endianness big_endian>
 std::pair<const char16_t *, char *>
-avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len,
-                             char *latin1_output) {
+sse_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+                            char *latin1_output) {
   const char16_t *end = buf + len;
-  while (end - buf >= 16) {
-    // Load 16 UTF-16 characters into 256-bit AVX2 register
-    __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
+  while (end - buf >= 8) {
+    // Load 8 UTF-16 characters into 128-bit SSE register
+    __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf));
 
     if (!match_system(big_endian)) {
-      const __m256i swap = _mm256_setr_epi8(
-          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
-          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
+      const __m128i swap =
+          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      in = _mm_shuffle_epi8(in, swap);
     }
 
-    __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
-    if (_mm256_testz_si256(in, high_byte_mask)) {
+    __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00);
+    if (_mm_testz_si128(in, high_byte_mask)) {
       // Pack 16-bit characters into 8-bit and store in latin1_output
-      __m128i lo = _mm256_extractf128_si256(in, 0);
-      __m128i hi = _mm256_extractf128_si256(in, 1);
-      __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo);
-      __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi);
+      __m128i latin1_packed = _mm_packus_epi16(in, in);
       _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
-                       latin1_packed_lo);
-      _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8),
-                       latin1_packed_hi);
+                       latin1_packed);
       // Adjust pointers for next iteration
-      buf += 16;
-      latin1_output += 16;
+      buf += 8;
+      latin1_output += 8;
     } else {
       return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
     }
@@ -26250,54 +39001,47 @@ avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len,
 
 template <endianness big_endian>
 std::pair<result, char *>
-avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
-                                         char *latin1_output) {
+sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
+                                        char *latin1_output) {
   const char16_t *start = buf;
   const char16_t *end = buf + len;
-  while (end - buf >= 16) {
-    __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
+  while (end - buf >= 8) {
+    __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf));
 
     if (!match_system(big_endian)) {
-      const __m256i swap = _mm256_setr_epi8(
-          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
-          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
+      const __m128i swap =
+          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      in = _mm_shuffle_epi8(in, swap);
     }
 
-    __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00);
-    if (_mm256_testz_si256(in, high_byte_mask)) {
-      __m128i lo = _mm256_extractf128_si256(in, 0);
-      __m128i hi = _mm256_extractf128_si256(in, 1);
-      __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo);
-      __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi);
+    __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00);
+    if (_mm_testz_si128(in, high_byte_mask)) {
+      __m128i latin1_packed = _mm_packus_epi16(in, in);
       _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
-                       latin1_packed_lo);
-      _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8),
-                       latin1_packed_hi);
-      buf += 16;
-      latin1_output += 16;
+                       latin1_packed);
+      buf += 8;
+      latin1_output += 8;
     } else {
       // Fallback to scalar code for handling errors
-      for (int k = 0; k < 16; k++) {
+      for (int k = 0; k < 8; k++) {
         uint16_t word = !match_system(big_endian)
                             ? scalar::utf16::swap_bytes(buf[k])
                             : buf[k];
         if (word <= 0xff) {
           *latin1_output++ = char(word);
         } else {
-          return std::make_pair(
-              result{error_code::TOO_LARGE, (size_t)(buf - start + k)},
-              latin1_output);
+          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+                                latin1_output);
         }
       }
-      buf += 16;
+      buf += 8;
     }
   } // while
-  return std::make_pair(result{error_code::SUCCESS, (size_t)(buf - start)},
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
                         latin1_output);
 }
-/* end file src/haswell/avx2_convert_utf16_to_latin1.cpp */
-/* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */
+/* end file src/westmere/sse_convert_utf16_to_latin1.cpp */
+/* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
     loads eight 16-bit code units.
@@ -26353,117 +39097,91 @@ avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
 */
 template <endianness big_endian>
 std::pair<const char16_t *, char *>
-avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
+sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
+
   const char16_t *end = buf + len;
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+
+  const __m128i v_0000 = _mm_setzero_si128();
+  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
   const size_t safety_margin =
       12; // to avoid overruns, see issue
           // https://github.com/simdutf/simdutf/issues/92
 
   while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
-    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+    __m128i in = _mm_loadu_si128((__m128i *)buf);
     if (big_endian) {
-      const __m256i swap = _mm256_setr_epi8(
-          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
-          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
+      const __m128i swap =
+          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      in = _mm_shuffle_epi8(in, swap);
     }
     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-    const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
-    if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(
-          _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
+    const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
+    if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
+      __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
+      if (big_endian) {
+        const __m128i swap =
+            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        nextin = _mm_shuffle_epi8(nextin, swap);
+      }
+      if (!_mm_testz_si128(nextin, v_ff80)) {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        const __m128i utf8_packed = _mm_packus_epi16(in, in);
+        // 2. store (16 bytes written, but only the first 8 are kept)
+        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        in = nextin;
+      } else {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
+        // 2. store (16 bytes)
+        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+      }
     }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
 
-      // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked =
-          _mm256_blendv_epi8(t4, in, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
-      // 4. pack the bytes
-
-      const uint8_t *row =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t *row_2 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
-                                                                       16)][0];
-
-      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+    // no bits set above 7th bit
+    const __m128i one_byte_bytemask =
+        _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
+    const uint16_t one_byte_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
 
-      const __m256i utf8_packed = _mm256_shuffle_epi8(
-          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
-      // 5. store bytes
-      _mm_storeu_si128((__m128i *)utf8_output,
-                       _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i *)utf8_output,
-                       _mm256_extractf128_si256(utf8_packed, 1));
-      utf8_output += row_2[0];
+    // no bits set above 11th bit
+    const __m128i one_or_two_bytes_bytemask =
+        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
+    const uint16_t one_or_two_bytes_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
 
-      // 6. adjust pointers
-      buf += 16;
+    if (one_or_two_bytes_bitmask == 0xffff) {
+      internal::westmere::write_v_u16_11bits_to_utf8(
+          in, utf8_output, one_byte_bytemask, one_byte_bitmask);
+      buf += 8;
       continue;
     }
+
     // 1. Check if there are any surrogate word in the input chunk.
     //    We have also deal with situation when there is a surrogate word
     //    at the end of a chunk.
-    const __m256i surrogates_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+    const __m128i surrogates_bytemask =
+        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
 
     // bitmask = 0x0000 if there are no surrogates
     //         = 0xc000 if the last word is a surrogate
-    const uint32_t surrogates_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    const uint16_t surrogates_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
     // It might seem like checking for surrogates_bitmask == 0xc000 could help.
     // However, it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x00000000) {
+    if (surrogates_bitmask == 0x0000) {
       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
-      const __m256i dup_even = _mm256_setr_epi16(
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
 
       /* In this branch we handle three cases:
          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
@@ -26492,90 +39210,67 @@ avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
        * t2 => [0ccc|cccc] [10cc|cccc]
        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
        */
-#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+      const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+      const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
 
       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in, 4);
+      const __m128i s0 = _mm_srli_epi16(in, 4);
       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
-                                             simdutf_vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
+      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
+                                          simdutf_vec(0b0100000000000000));
+      const __m128i s4 = _mm_xor_si128(s3, m0);
 #undef simdutf_vec
 
       // 4. expand code units 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
 
       // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be
-      // useful.
-      /*if(mask == 0) {
+      const uint16_t mask =
+          (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+      if (mask == 0) {
         // We only have three-byte code units. Use fast path.
-        const __m256i shuffle =
-      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
-      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
-      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
-      _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
+                                              15, 13, -1, -1, -1, -1);
+        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
         utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
         utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output,
-      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output,
-      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+        buf += 8;
         continue;
-      }*/
+      }
       const uint8_t mask0 = uint8_t(mask);
+
       const uint8_t *row0 =
           &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
       const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
-      const __m128i utf8_0 =
-          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
 
       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
       const uint8_t *row1 =
           &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
       const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
-      const __m128i utf8_1 =
-          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t *row2 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
-      const __m128i utf8_2 =
-          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
-
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t *row3 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
-      const __m128i utf8_3 =
-          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
 
       _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
       utf8_output += row0[0];
       _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
       utf8_output += row1[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
+
+      buf += 8;
       // surrogate pair(s) in a register
     } else {
       // Let us do a scalar fallback.
@@ -26617,6 +39312,7 @@ avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
       buf += k;
     }
   } // while
+
   return std::make_pair(buf, utf8_output);
 }
 
@@ -26629,120 +39325,92 @@ avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
 */
 template <endianness big_endian>
 std::pair<result, char *>
-avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
-                                       char *utf8_output) {
+sse_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
+                                      char *utf8_output) {
   const char16_t *start = buf;
   const char16_t *end = buf + len;
 
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
-  const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
+  const __m128i v_0000 = _mm_setzero_si128();
+  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
   const size_t safety_margin =
       12; // to avoid overruns, see issue
           // https://github.com/simdutf/simdutf/issues/92
 
   while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
-    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+    __m128i in = _mm_loadu_si128((__m128i *)buf);
     if (big_endian) {
-      const __m256i swap = _mm256_setr_epi8(
-          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
-          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
+      const __m128i swap =
+          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      in = _mm_shuffle_epi8(in, swap);
     }
     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-    const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
-    if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(
-          _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1));
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
+    const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
+    if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
+      __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
+      if (big_endian) {
+        const __m128i swap =
+            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+        nextin = _mm_shuffle_epi8(nextin, swap);
+      }
+      if (!_mm_testz_si128(nextin, v_ff80)) {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        const __m128i utf8_packed = _mm_packus_epi16(in, in);
+        // 2. store (16 bytes are written; only the first 8 are kept)
+        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        in = nextin;
+      } else {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
+        // 2. store (16 bytes)
+        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+      }
     }
+
     // no bits set above 7th bit
-    const __m256i one_byte_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+    const __m128i one_byte_bytemask =
+        _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
+    const uint16_t one_byte_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
 
     // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
-
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked =
-          _mm256_blendv_epi8(t4, in, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
-      // 4. pack the bytes
-
-      const uint8_t *row =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t *row_2 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
-                                                                       16)][0];
-
-      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
-
-      const __m256i utf8_packed = _mm256_shuffle_epi8(
-          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
-      // 5. store bytes
-      _mm_storeu_si128((__m128i *)utf8_output,
-                       _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i *)utf8_output,
-                       _mm256_extractf128_si256(utf8_packed, 1));
-      utf8_output += row_2[0];
+    const __m128i one_or_two_bytes_bytemask =
+        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
+    const uint16_t one_or_two_bytes_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
 
-      // 6. adjust pointers
-      buf += 16;
+    if (one_or_two_bytes_bitmask == 0xffff) {
+      internal::westmere::write_v_u16_11bits_to_utf8(
+          in, utf8_output, one_byte_bytemask, one_byte_bitmask);
+      buf += 8;
       continue;
     }
+
     // 1. Check if there are any surrogate word in the input chunk.
     //    We have also deal with situation when there is a surrogate word
     //    at the end of a chunk.
-    const __m256i surrogates_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+    const __m128i surrogates_bytemask =
+        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
 
     // bitmask = 0x0000 if there are no surrogates
     //         = 0xc000 if the last word is a surrogate
-    const uint32_t surrogates_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    const uint16_t surrogates_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
     // It might seem like checking for surrogates_bitmask == 0xc000 could help.
     // However, it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x00000000) {
+    if (surrogates_bitmask == 0x0000) {
       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
-      const __m256i dup_even = _mm256_setr_epi16(
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
 
       /* In this branch we handle three cases:
          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
@@ -26771,90 +39439,67 @@ avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
        * t2 => [0ccc|cccc] [10cc|cccc]
        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
        */
-#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in, dup_even);
+      const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+      const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
 
       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in, 4);
+      const __m128i s0 = _mm_srli_epi16(in, 4);
       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
-                                             simdutf_vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
+      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
+                                          simdutf_vec(0b0100000000000000));
+      const __m128i s4 = _mm_xor_si128(s3, m0);
 #undef simdutf_vec
 
       // 4. expand code units 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
 
       // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be
-      // useful.
-      /*if(mask == 0) {
+      const uint16_t mask =
+          (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+      if (mask == 0) {
         // We only have three-byte code units. Use fast path.
-        const __m256i shuffle =
-      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
-      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
-      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
-      _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
+                                              15, 13, -1, -1, -1, -1);
+        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
         utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
         utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output,
-      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output,
-      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+        buf += 8;
         continue;
-      }*/
+      }
       const uint8_t mask0 = uint8_t(mask);
+
       const uint8_t *row0 =
           &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
       const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
-      const __m128i utf8_0 =
-          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
 
       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
       const uint8_t *row1 =
           &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
       const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
-      const __m128i utf8_1 =
-          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t *row2 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
-      const __m128i utf8_2 =
-          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
-
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t *row3 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
-      const __m128i utf8_3 =
-          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
 
       _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
       utf8_output += row0[0];
       _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
       utf8_output += row1[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
+
+      buf += 8;
       // surrogate pair(s) in a register
     } else {
       // Let us do a scalar fallback.
@@ -26898,10 +39543,11 @@ avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
       buf += k;
     }
   } // while
+
   return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
-/* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */
-/* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */
+/* end file src/westmere/sse_convert_utf16_to_utf8.cpp */
+/* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */
 /*
     The vectorized algorithm works on single SSE register i.e., it
     loads eight 16-bit code units.
@@ -26910,14 +39556,14 @@ avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
     1. an input register contains no surrogates and each value
        is in range 0x0000 .. 0x07ff.
     2. an input register contains no surrogates and values are
-       in range 0x0000 .. 0xffff.
+       in range 0x0000 .. 0xffff.
     3. an input register contains surrogates --- i.e. codepoints
        can have 16 or 32 bits.
 
     Ad 1.
 
     When values are less than 0x0800, it means that a 16-bit code unit
-    can be converted into: 1) single UTF8 byte (when it is an ASCII
+    can be converted into: 1) single UTF8 byte (when it's an ASCII
     char) or 2) two UTF8 bytes.
 
     For this case we do only some shuffle to obtain these 2-byte
@@ -26952,48 +39598,47 @@ avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
 */
 
 /*
-  Returns a pair: the first unprocessed byte from buf and utf32_output
+  Returns a pair: the first unprocessed byte from buf and utf32_output
   A scalar routing should carry on the conversion of the tail.
 */
 template <endianness big_endian>
 std::pair<const char16_t *, char32_t *>
-avx2_convert_utf16_to_utf32(const char16_t *buf, size_t len,
-                            char32_t *utf32_output) {
+sse_convert_utf16_to_utf32(const char16_t *buf, size_t len,
+                           char32_t *utf32_output) {
   const char16_t *end = buf + len;
-  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
 
-  while (end - buf >= 16) {
-    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+
+  while (end - buf >= 8) {
+    __m128i in = _mm_loadu_si128((__m128i *)buf);
+
     if (big_endian) {
-      const __m256i swap = _mm256_setr_epi8(
-          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
-          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
+      const __m128i swap =
+          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      in = _mm_shuffle_epi8(in, swap);
     }
 
     // 1. Check if there are any surrogate word in the input chunk.
     //    We have also deal with situation when there is a surrogate word
     //    at the end of a chunk.
-    const __m256i surrogates_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+    const __m128i surrogates_bytemask =
+        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
 
     // bitmask = 0x0000 if there are no surrogates
     //         = 0xc000 if the last word is a surrogate
-    const uint32_t surrogates_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    const uint16_t surrogates_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
     // It might seem like checking for surrogates_bitmask == 0xc000 could help.
     // However, it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x00000000) {
-      // case: we extend all sixteen 16-bit code units to sixteen 32-bit code
-      // units
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
-                          _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
-      _mm256_storeu_si256(
-          reinterpret_cast<__m256i *>(utf32_output + 8),
-          _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
-      utf32_output += 16;
-      buf += 16;
+    if (surrogates_bitmask == 0x0000) {
+      // case: no surrogate pair, extend 16-bit code units to 32-bit code units
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
+                       _mm_cvtepu16_epi32(in));
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
+                       _mm_cvtepu16_epi32(_mm_srli_si128(in, 8)));
+      utf32_output += 8;
+      buf += 8;
       // surrogate pair(s) in a register
     } else {
       // Let us do a scalar fallback.
@@ -27007,7 +39652,6 @@ avx2_convert_utf16_to_utf32(const char16_t *buf, size_t len,
       for (; k < forward; k++) {
         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
         if ((word & 0xF800) != 0xD800) {
-          // No surrogate pair
           *utf32_output++ = char32_t(word);
         } else {
           // must be a surrogate pair
@@ -27038,44 +39682,43 @@ avx2_convert_utf16_to_utf32(const char16_t *buf, size_t len,
 */
 template <endianness big_endian>
 std::pair<result, char32_t *>
-avx2_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
-                                        char32_t *utf32_output) {
+sse_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
+                                       char32_t *utf32_output) {
   const char16_t *start = buf;
   const char16_t *end = buf + len;
-  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
-  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
 
-  while (end - buf >= 16) {
-    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
+  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+
+  while (end - buf >= 8) {
+    __m128i in = _mm_loadu_si128((__m128i *)buf);
+
     if (big_endian) {
-      const __m256i swap = _mm256_setr_epi8(
-          1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
-          21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
-      in = _mm256_shuffle_epi8(in, swap);
+      const __m128i swap =
+          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+      in = _mm_shuffle_epi8(in, swap);
     }
 
     // 1. Check if there are any surrogate word in the input chunk.
     //    We have also deal with situation when there is a surrogate word
     //    at the end of a chunk.
-    const __m256i surrogates_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
+    const __m128i surrogates_bytemask =
+        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
 
     // bitmask = 0x0000 if there are no surrogates
     //         = 0xc000 if the last word is a surrogate
-    const uint32_t surrogates_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(surrogates_bytemask));
+    const uint16_t surrogates_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
     // It might seem like checking for surrogates_bitmask == 0xc000 could help.
     // However, it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x00000000) {
-      // case: we extend all sixteen 16-bit code units to sixteen 32-bit code
-      // units
-      _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output),
-                          _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in)));
-      _mm256_storeu_si256(
-          reinterpret_cast<__m256i *>(utf32_output + 8),
-          _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1)));
-      utf32_output += 16;
-      buf += 16;
+    if (surrogates_bitmask == 0x0000) {
+      // case: no surrogate pair, extend 16-bit code units to 32-bit code units
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
+                       _mm_cvtepu16_epi32(in));
+      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
+                       _mm_cvtepu16_epi32(_mm_srli_si128(in, 8)));
+      utf32_output += 8;
+      buf += 8;
       // surrogate pair(s) in a register
     } else {
       // Let us do a scalar fallback.
@@ -27089,7 +39732,6 @@ avx2_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
       for (; k < forward; k++) {
         uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
         if ((word & 0xF800) != 0xD800) {
-          // No surrogate pair
           *utf32_output++ = char32_t(word);
         } else {
           // must be a surrogate pair
@@ -27112,229 +39754,290 @@ avx2_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
   } // while
   return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
 }
-/* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */
+/* end file src/westmere/sse_convert_utf16_to_utf32.cpp */
 
-/* begin file src/haswell/avx2_convert_utf32_to_latin1.cpp */
+/* begin file src/westmere/sse_convert_utf32_to_latin1.cpp */
 std::pair<const char32_t *, char *>
-avx2_convert_utf32_to_latin1(const char32_t *buf, size_t len,
-                             char *latin1_output) {
-  const size_t rounded_len =
-      len & ~0x1F; // Round down to nearest multiple of 32
-
-  __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);
+sse_convert_utf32_to_latin1(const char32_t *buf, size_t len,
+                            char *latin1_output) {
+  const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
 
-  __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                                     -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1,
-                                     -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
+  __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00);
+  __m128i shufmask =
+      _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
 
   for (size_t i = 0; i < rounded_len; i += 16) {
-    __m256i in1 = _mm256_loadu_si256((__m256i *)buf);
-    __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8));
+    __m128i in1 = _mm_loadu_si128((__m128i *)buf);
+    __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4));
+    __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8));
+    __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12));
 
-    __m256i check_combined = _mm256_or_si256(in1, in2);
+    __m128i check_combined = _mm_or_si128(in1, in2);
+    check_combined = _mm_or_si128(check_combined, in3);
+    check_combined = _mm_or_si128(check_combined, in4);
 
-    if (!_mm256_testz_si256(check_combined, high_bytes_mask)) {
+    if (!_mm_testz_si128(check_combined, high_bytes_mask)) {
       return std::make_pair(nullptr, latin1_output);
     }
-
-    // Turn UTF32 bytes into latin 1 bytes
-    __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask);
-    __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask);
-
-    // move Latin1 bytes to their correct spot
-    __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0);
-    __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1);
-    __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1);
-    __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2);
-
-    __m256i result = _mm256_or_si256(reshuffled1, reshuffled2);
-    _mm_storeu_si128((__m128i *)latin1_output, _mm256_castsi256_si128(result));
-
+    __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask),
+                                       _mm_shuffle_epi8(in2, shufmask));
+    __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask),
+                                       _mm_shuffle_epi8(in4, shufmask));
+    __m128i pack = _mm_unpacklo_epi64(pack1, pack2);
+    _mm_storeu_si128((__m128i *)latin1_output, pack);
     latin1_output += 16;
     buf += 16;
   }
 
   return std::make_pair(buf, latin1_output);
 }
-std::pair<result, char *>
-avx2_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
-                                         char *latin1_output) {
-  const size_t rounded_len =
-      len & ~0x1F; // Round down to nearest multiple of 32
 
-  __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00);
-  __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                                     -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1,
-                                     -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
+std::pair<result, char *>
+sse_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+                                        char *latin1_output) {
+  const char32_t *start = buf;
+  const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
 
-  const char32_t *start = buf;
+  __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00);
+  __m128i shufmask =
+      _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
 
   for (size_t i = 0; i < rounded_len; i += 16) {
-    __m256i in1 = _mm256_loadu_si256((__m256i *)buf);
-    __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8));
+    __m128i in1 = _mm_loadu_si128((__m128i *)buf);
+    __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4));
+    __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8));
+    __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12));
 
-    __m256i check_combined = _mm256_or_si256(in1, in2);
+    __m128i check_combined = _mm_or_si128(in1, in2);
+    check_combined = _mm_or_si128(check_combined, in3);
+    check_combined = _mm_or_si128(check_combined, in4);
 
-    if (!_mm256_testz_si256(check_combined, high_bytes_mask)) {
+    if (!_mm_testz_si128(check_combined, high_bytes_mask)) {
       // Fallback to scalar code for handling errors
-      for (int k = 0; k < 8; k++) {
+      for (int k = 0; k < 16; k++) {
         char32_t codepoint = buf[k];
-        if (codepoint <= 0xFF) {
-          *latin1_output++ = static_cast<char>(codepoint);
+        if (codepoint <= 0xff) {
+          *latin1_output++ = char(codepoint);
         } else {
           return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
                                 latin1_output);
         }
       }
-      buf += 8;
-    } else {
-      __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask);
-      __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask);
-
-      __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0);
-      __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1);
-      __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1);
-      __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2);
-
-      __m256i result = _mm256_or_si256(reshuffled1, reshuffled2);
-      _mm_storeu_si128((__m128i *)latin1_output,
-                       _mm256_castsi256_si128(result));
-
-      latin1_output += 16;
       buf += 16;
+      continue;
     }
+    __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask),
+                                       _mm_shuffle_epi8(in2, shufmask));
+    __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask),
+                                       _mm_shuffle_epi8(in4, shufmask));
+    __m128i pack = _mm_unpacklo_epi64(pack1, pack2);
+    _mm_storeu_si128((__m128i *)latin1_output, pack);
+    latin1_output += 16;
+    buf += 16;
   }
 
   return std::make_pair(result(error_code::SUCCESS, buf - start),
                         latin1_output);
 }
-/* end file src/haswell/avx2_convert_utf32_to_latin1.cpp */
-/* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */
+/* end file src/westmere/sse_convert_utf32_to_latin1.cpp */
+/* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */
 std::pair<const char32_t *, char *>
-avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
+sse_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
   const char32_t *end = buf + len;
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-  __m256i running_max = _mm256_setzero_si256();
-  __m256i forbidden_bytemask = _mm256_setzero_si256();
 
+  const __m128i v_0000 = _mm_setzero_si128();              //__m128 = 128 bits
+  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); // 1111 1000 0000
+                                                           // 0000
+  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); // 1100 0000 1000
+                                                           // 0000
+  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); // 1111 1111 1000
+                                                           // 0000
+  const __m128i v_ffff0000 = _mm_set1_epi32(
+      (uint32_t)0xffff0000); // 1111 1111 1111 1111 0000 0000 0000 0000
+  const __m128i v_7fffffff = _mm_set1_epi32(
+      (uint32_t)0x7fffffff); // 0111 1111 1111 1111 1111 1111 1111 1111
+  __m128i running_max = _mm_setzero_si128();
+  __m128i forbidden_bytemask = _mm_setzero_si128();
   const size_t safety_margin =
       12; // to avoid overruns, see issue
           // https://github.com/simdutf/simdutf/issues/92
 
-  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
-    __m256i in = _mm256_loadu_si256((__m256i *)buf);
-    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
-    running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
+  while (end - buf >=
+         std::ptrdiff_t(
+             16 + safety_margin)) { // buf is a char32_t pointer, each char32_t
+                                    // has 4 bytes or 32 bits, thus buf + 16 *
+                                    // char32_t = 512 bits = 64 bytes
+    // We load two 16-byte registers for a total of 32 bytes or 8 characters.
+    __m128i in = _mm_loadu_si128((__m128i *)buf);
+    __m128i nextin = _mm_loadu_si128(
+        (__m128i *)buf + 1); // These two values can hold only 8 UTF32 chars
+    running_max = _mm_max_epu32(
+        _mm_max_epu32(in, running_max), // take element-wise max char32_t from
+                                        // in and running_max vector
+        nextin); // and take element-wise max element from nextin and
+                 // running_max vector
 
     // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
     // saturation
-    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
-                                        _mm256_and_si256(nextin, v_7fffffff));
-    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+    __m128i in_16 = _mm_packus_epi32(
+        _mm_and_si128(in, v_7fffffff),
+        _mm_and_si128(
+            nextin,
+            v_7fffffff)); // pack the two __m128i into a single register.
+    // _mm_packus_epi32 treats its inputs as signed 32-bit integers, so we
+    // clear the highest bit (& v_7fffffff) to make sure that every value is
+    // interpreted as non-negative (in two's complement, a leading 0 bit
+    // means a non-negative number). Valid Unicode code points are far below
+    // that bit, so they are unaffected; too-large inputs are still caught at
+    // the end through running_max, which uses the unmasked values.
 
-    // Try to apply UTF-16 => UTF-8 routine on 256 bits
-    // (haswell/avx2_convert_utf16_to_utf8.cpp)
+    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
 
-    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(
-          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
+    // Check for ASCII fast path
+
+    // ASCII fast path!!!!
+    // We eagerly load another 32 bytes, hoping that they will be ASCII too.
+    // The intuition is that we try to collect 16 ASCII characters which
+    // requires a total of 64 bytes of input. If we fail, we just pass thirdin
+    // and fourthin as our new inputs.
+    if (_mm_testz_si128(in_16, v_ff80)) { // if the first two blocks are ASCII
+      __m128i thirdin = _mm_loadu_si128((__m128i *)buf + 2);
+      __m128i fourthin = _mm_loadu_si128((__m128i *)buf + 3);
+      running_max = _mm_max_epu32(
+          _mm_max_epu32(thirdin, running_max),
+          fourthin); // take the running max of all 4 vectors thus far
+      __m128i nextin_16 = _mm_packus_epi32(
+          _mm_and_si128(thirdin, v_7fffffff),
+          _mm_and_si128(fourthin,
+                        v_7fffffff)); // pack into 1 vector, now you have two
+      if (!_mm_testz_si128(
+              nextin_16,
+              v_ff80)) { // checks if the second packed vector is ASCII, if not:
+        // 1. pack the bytes
+        // obviously suboptimal.
+        const __m128i utf8_packed = _mm_packus_epi16(
+            in_16, in_16); // two packed copies of in_16 in one vector
+        // 2. store (16 bytes)
+        _mm_storeu_si128((__m128i *)utf8_output,
+                         utf8_packed); // put them into the output
+        // 3. adjust pointers
+        buf += 8; // the char32_t buffer pointer goes up 8 char32_t chars* 32
+                  // bits =  256 bits
+        utf8_output +=
+            8; // same with output, e.g. lift the first two blocks alone.
+        // Proceed with next input
+        in_16 = nextin_16;
+        // We need to update in and nextin because they are used later.
+        in = thirdin;
+        nextin = fourthin;
+      } else {
+        // 1. pack the bytes
+        const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
+        // 2. store (16 bytes)
+        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+      }
     }
-    // no bits set above 7th bit
-    const __m256i one_byte_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+
+    // no bits set above 7th bit -- find out all the ASCII characters
+    const __m128i one_byte_bytemask =
+        _mm_cmpeq_epi16( // this takes two bytes at a time and compares:
+            _mm_and_si128(in_16, v_ff80), // the vector that keeps only the top
+                                          // 9 bits of each 16-bit/2-byte unit
+            v_0000                        //
+        ); // they should be all zero if they are ASCII. E.g. ASCII packed to
+           // 16 bits is of the format 0000 0000 0XXX XXXX
+    // _mm_cmpeq_epi16 now returns 1111 1111 1111 1111 for each 16-bit/2-byte
+    // unit that is equal, and 0000 0000 0000 0000 for each one that is not
+    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(
+        one_byte_bytemask)); // collect the MSB from previous vector and put
+                             // them into a uint16_t mask
 
     // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
+    const __m128i one_or_two_bytes_bytemask =
+        _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
+    const uint16_t one_or_two_bytes_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+    if (one_or_two_bytes_bitmask == 0xffff) {
+      // case: all code units either produce 1 or 2 UTF-8 bytes (at least one
+      // produces 2 bytes)
       // 1. prepare 2-byte values
       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
       // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+      const __m128i v_1f00 =
+          _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000
+      const __m128i v_003f =
+          _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111
 
       // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+      const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two
       // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+      const __m128i t1 =
+          _mm_and_si128(t0, v_1f00); // potential first utf8 byte
       // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+      const __m128i t2 =
+          _mm_and_si128(in_16, v_003f); // potential second utf8 byte
       // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
+      const __m128i t3 =
+          _mm_or_si128(t1, t2); // first and second potential utf8 byte together
       // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
+      const __m128i t4 = _mm_or_si128(
+          t3,
+          v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit
 
       // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked =
-          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+      const __m128i utf8_unpacked =
+          _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
 
       // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h -
+      //    MSB, a - LSB)
+      const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+      const uint16_t m1 =
+          static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+      const uint8_t m2 =
+          static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
       // 4. pack the bytes
-
       const uint8_t *row =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t *row_2 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
-                                                                       16)][0];
-
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
       const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
+      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
 
-      const __m256i utf8_packed = _mm256_shuffle_epi8(
-          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
       // 5. store bytes
-      _mm_storeu_si128((__m128i *)utf8_output,
-                       _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i *)utf8_output,
-                       _mm256_extractf128_si256(utf8_packed, 1));
-      utf8_output += row_2[0];
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
 
       // 6. adjust pointers
-      buf += 16;
+      buf += 8;
+      utf8_output += row[0];
       continue;
     }
-    // Must check for overflow in packing
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
-        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+
+    // Check for overflow in packing
+
+    const __m128i saturation_bytemask = _mm_cmpeq_epi32(
+        _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
     const uint32_t saturation_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-    if (saturation_bitmask == 0xffffffff) {
+        static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+    if (saturation_bitmask == 0xffff) {
       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
-      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-      forbidden_bytemask = _mm256_or_si256(
-          forbidden_bytemask,
-          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800));
+      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+      forbidden_bytemask =
+          _mm_or_si128(forbidden_bytemask,
+                       _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));
 
-      const __m256i dup_even = _mm256_setr_epi16(
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
 
       /* In this branch we handle three cases:
-        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
         single UFT-8 byte
-        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
-        UTF-8 bytes
-        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
+        two UTF-8 bytes
+          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
         three UTF-8 bytes
 
         We expand the input word (16-bit) into two code units (32-bit), thus
@@ -27356,95 +40059,72 @@ avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
        * t2 => [0ccc|cccc] [10cc|cccc]
        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
        */
-#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+      const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
 
       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+      const __m128i s0 = _mm_srli_epi16(in_16, 4);
       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
-                                             simdutf_vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
+      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
+                                          simdutf_vec(0b0100000000000000));
+      const __m128i s4 = _mm_xor_si128(s3, m0);
 #undef simdutf_vec
 
       // 4. expand code units 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
 
       // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be
-      // useful.
-      /*if(mask == 0) {
+      const uint16_t mask =
+          (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+      if (mask == 0) {
         // We only have three-byte code units. Use fast path.
-        const __m256i shuffle =
-      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
-      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
-      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
-      _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
+                                              15, 13, -1, -1, -1, -1);
+        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
         utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
         utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output,
-      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output,
-      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+        buf += 8;
         continue;
-      }*/
+      }
       const uint8_t mask0 = uint8_t(mask);
+
       const uint8_t *row0 =
           &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
       const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
-      const __m128i utf8_0 =
-          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
 
       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
       const uint8_t *row1 =
           &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
       const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
-      const __m128i utf8_1 =
-          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t *row2 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
-      const __m128i utf8_2 =
-          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
-
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t *row3 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
-      const __m128i utf8_3 =
-          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
 
       _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
       utf8_output += row0[0];
       _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
       utf8_output += row1[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
+
+      buf += 8;
     } else {
-      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
-      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
-      // wasteful to use scalar code, but being efficient with SIMD may require
-      // large, non-trivial tables?
+      // case: at least one 32-bit word needs a surrogate pair in UTF-16 <=>
+      // it will produce four UTF-8 bytes. Let us do a scalar fallback. It may
+      // seem wasteful to use scalar code, but being efficient with SIMD in
+      // the presence of surrogate pairs may require non-trivial tables.
       size_t forward = 15;
       size_t k = 0;
       if (size_t(end - buf) < forward + 1) {
@@ -27452,19 +40132,19 @@ avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
       }
       for (; k < forward; k++) {
         uint32_t word = buf[k];
-        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+        if ((word & 0xFFFFFF80) == 0) {
           *utf8_output++ = char(word);
-        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+        } else if ((word & 0xFFFFF800) == 0) {
           *utf8_output++ = char((word >> 6) | 0b11000000);
           *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+        } else if ((word & 0xFFFF0000) == 0) {
           if (word >= 0xD800 && word <= 0xDFFF) {
             return std::make_pair(nullptr, utf8_output);
           }
           *utf8_output++ = char((word >> 12) | 0b11100000);
           *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
           *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else { // 4-byte
+        } else {
           if (word > 0x10FFFF) {
             return std::make_pair(nullptr, utf8_output);
           }
@@ -27479,13 +40159,13 @@ avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
   } // while
 
   // check for invalid input
-  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(_mm256_cmpeq_epi32(
-          _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) {
+  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
+  if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(
+          _mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
     return std::make_pair(nullptr, utf8_output);
   }
 
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+  if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
     return std::make_pair(nullptr, utf8_output);
   }
 
@@ -27493,145 +40173,141 @@ avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
 }
 
 std::pair<result, char *>
-avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
-                                       char *utf8_output) {
+sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
+                                      char *utf8_output) {
   const char32_t *end = buf + len;
   const char32_t *start = buf;
 
-  const __m256i v_0000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
-  const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80);
-  const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800);
-  const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080);
-  const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff);
-  const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff);
+  const __m128i v_0000 = _mm_setzero_si128();
+  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
+  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
+  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
+  const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
+  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
 
   const size_t safety_margin =
       12; // to avoid overruns, see issue
           // https://github.com/simdutf/simdutf/issues/92
 
   while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
-    __m256i in = _mm256_loadu_si256((__m256i *)buf);
-    __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1);
+    // We load two 16-byte registers for a total of 32 bytes or 8 characters.
+    __m128i in = _mm_loadu_si128((__m128i *)buf);
+    __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
     // Check for too large input
-    const __m256i max_input =
-        _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
-    if (static_cast<uint32_t>(_mm256_movemask_epi8(
-            _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) {
+    __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
+    if (static_cast<uint16_t>(_mm_movemask_epi8(
+            _mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
       return std::make_pair(result(error_code::TOO_LARGE, buf - start),
                             utf8_output);
     }
 
     // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
     // saturation
-    __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
-                                        _mm256_and_si256(nextin, v_7fffffff));
-    in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000);
+    __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff),
+                                     _mm_and_si128(nextin, v_7fffffff));
 
-    // Try to apply UTF-16 => UTF-8 routine on 256 bits
-    // (haswell/avx2_convert_utf16_to_utf8.cpp)
+    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
 
-    if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!!
+    // Check for ASCII fast path
+    if (_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
       // 1. pack the bytes
-      const __m128i utf8_packed = _mm_packus_epi16(
-          _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1));
+      // obviously suboptimal.
+      const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16);
       // 2. store (16 bytes)
       _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
       // 3. adjust pointers
-      buf += 16;
-      utf8_output += 16;
-      continue; // we are done for this round!
+      buf += 8;
+      utf8_output += 8;
+      continue;
     }
+
     // no bits set above 7th bit
-    const __m256i one_byte_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000);
-    const uint32_t one_byte_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(one_byte_bytemask));
+    const __m128i one_byte_bytemask =
+        _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
+    const uint16_t one_byte_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
 
     // no bits set above 11th bit
-    const __m256i one_or_two_bytes_bytemask =
-        _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000);
-    const uint32_t one_or_two_bytes_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(one_or_two_bytes_bytemask));
-    if (one_or_two_bytes_bitmask == 0xffffffff) {
+    const __m128i one_or_two_bytes_bytemask =
+        _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
+    const uint16_t one_or_two_bytes_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+
+    if (one_or_two_bytes_bitmask == 0xffff) {
+      // case: all code units either produce 1 or 2 UTF-8 bytes (at least one
+      // produces 2 bytes)
       // 1. prepare 2-byte values
       // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
       // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00);
-      const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f);
+      const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
+      const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
 
       // t0 = [000a|aaaa|bbbb|bb00]
-      const __m256i t0 = _mm256_slli_epi16(in_16, 2);
+      const __m128i t0 = _mm_slli_epi16(in_16, 2);
       // t1 = [000a|aaaa|0000|0000]
-      const __m256i t1 = _mm256_and_si256(t0, v_1f00);
+      const __m128i t1 = _mm_and_si128(t0, v_1f00);
       // t2 = [0000|0000|00bb|bbbb]
-      const __m256i t2 = _mm256_and_si256(in_16, v_003f);
+      const __m128i t2 = _mm_and_si128(in_16, v_003f);
       // t3 = [000a|aaaa|00bb|bbbb]
-      const __m256i t3 = _mm256_or_si256(t1, t2);
+      const __m128i t3 = _mm_or_si128(t1, t2);
       // t4 = [110a|aaaa|10bb|bbbb]
-      const __m256i t4 = _mm256_or_si256(t3, v_c080);
+      const __m128i t4 = _mm_or_si128(t3, v_c080);
 
       // 2. merge ASCII and 2-byte codewords
-      const __m256i utf8_unpacked =
-          _mm256_blendv_epi8(t4, in_16, one_byte_bytemask);
+      const __m128i utf8_unpacked =
+          _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
 
       // 3. prepare bitmask for 8-bit lookup
-      const uint32_t M0 = one_byte_bitmask & 0x55555555;
-      const uint32_t M1 = M0 >> 7;
-      const uint32_t M2 = (M1 | M0) & 0x00ff00ff;
+      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h -
+      //    MSB, a - LSB)
+      const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
+      const uint16_t m1 =
+          static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
+      const uint8_t m2 =
+          static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
       // 4. pack the bytes
-
       const uint8_t *row =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0];
-      const uint8_t *row_2 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >>
-                                                                       16)][0];
-
+          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
       const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
-      const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1));
-
-      const __m256i utf8_packed = _mm256_shuffle_epi8(
-          utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2));
-      // 5. store bytes
-      _mm_storeu_si128((__m128i *)utf8_output,
-                       _mm256_castsi256_si128(utf8_packed));
-      utf8_output += row[0];
-      _mm_storeu_si128((__m128i *)utf8_output,
-                       _mm256_extractf128_si256(utf8_packed, 1));
-      utf8_output += row_2[0];
+      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
+
+      // 5. store bytes
+      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
 
       // 6. adjust pointers
-      buf += 16;
+      buf += 8;
+      utf8_output += row[0];
       continue;
     }
-    // Must check for overflow in packing
-    const __m256i saturation_bytemask = _mm256_cmpeq_epi32(
-        _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000);
+
+    // Check for overflow in packing
+    const __m128i saturation_bytemask = _mm_cmpeq_epi32(
+        _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
     const uint32_t saturation_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
-    if (saturation_bitmask == 0xffffffff) {
+        static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+
+    if (saturation_bitmask == 0xffff) {
       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
 
       // Check for illegal surrogate code units
-      const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800);
-      const __m256i forbidden_bytemask =
-          _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
-          0x0) {
+      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+      const __m128i forbidden_bytemask =
+          _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
+      if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
         return std::make_pair(result(error_code::SURROGATE, buf - start),
                               utf8_output);
       }
 
-      const __m256i dup_even = _mm256_setr_epi16(
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e,
-          0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
+      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
+                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
 
       /* In this branch we handle three cases:
-        1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
         single UFT-8 byte
-        2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
-        UTF-8 bytes
-        3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
+        two UTF-8 bytes
+          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
         three UTF-8 bytes
 
         We expand the input word (16-bit) into two code units (32-bit), thus
@@ -27653,95 +40329,72 @@ avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
        * t2 => [0ccc|cccc] [10cc|cccc]
        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
        */
-#define simdutf_vec(x) _mm256_set1_epi16(static_cast<uint16_t>(x))
+#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even);
+      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
       // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111));
+      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000));
+      const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
 
       // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m256i s0 = _mm256_srli_epi16(in_16, 4);
+      const __m128i s0 = _mm_srli_epi16(in_16, 4);
       // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100));
+      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
       // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140));
+      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
       // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000));
-      const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask,
-                                             simdutf_vec(0b0100000000000000));
-      const __m256i s4 = _mm256_xor_si256(s3, m0);
+      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
+      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
+                                          simdutf_vec(0b0100000000000000));
+      const __m128i s4 = _mm_xor_si128(s3, m0);
 #undef simdutf_vec
 
       // 4. expand code units 16-bit => 32-bit
-      const __m256i out0 = _mm256_unpacklo_epi16(t2, s4);
-      const __m256i out1 = _mm256_unpackhi_epi16(t2, s4);
+      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
+      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
 
       // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint32_t mask = (one_byte_bitmask & 0x55555555) |
-                            (one_or_two_bytes_bitmask & 0xaaaaaaaa);
-      // Due to the wider registers, the following path is less likely to be
-      // useful.
-      /*if(mask == 0) {
+      const uint16_t mask =
+          (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
+      if (mask == 0) {
         // We only have three-byte code units. Use fast path.
-        const __m256i shuffle =
-      _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1,
-      2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 =
-      _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 =
-      _mm256_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0));
+        const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
+                                              15, 13, -1, -1, -1, -1);
+        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
+        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
+        _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
         utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1));
+        _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
         utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output,
-      _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12;
-        _mm_storeu_si128((__m128i*)utf8_output,
-      _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16;
+        buf += 8;
         continue;
-      }*/
+      }
       const uint8_t mask0 = uint8_t(mask);
+
       const uint8_t *row0 =
           &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
       const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
-      const __m128i utf8_0 =
-          _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0);
+      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
 
       const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+
       const uint8_t *row1 =
           &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
       const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
-      const __m128i utf8_1 =
-          _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1);
-
-      const uint8_t mask2 = static_cast<uint8_t>(mask >> 16);
-      const uint8_t *row2 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0];
-      const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1));
-      const __m128i utf8_2 =
-          _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2);
-
-      const uint8_t mask3 = static_cast<uint8_t>(mask >> 24);
-      const uint8_t *row3 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0];
-      const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1));
-      const __m128i utf8_3 =
-          _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3);
+      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
 
       _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
       utf8_output += row0[0];
       _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
       utf8_output += row1[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_2);
-      utf8_output += row2[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_3);
-      utf8_output += row3[0];
-      buf += 16;
+
+      buf += 8;
     } else {
-      // case: at least one 32-bit word is larger than 0xFFFF <=> it will
-      // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem
-      // wasteful to use scalar code, but being efficient with SIMD may require
-      // large, non-trivial tables?
+      // case: at least one 32-bit word produces a surrogate pair in UTF-16
+      // <=> it will produce four UTF-8 bytes. Let us do a scalar fallback. It
+      // may seem wasteful to use scalar code, but being efficient with SIMD in
+      // the presence of surrogate pairs may require non-trivial tables.
       size_t forward = 15;
       size_t k = 0;
       if (size_t(end - buf) < forward + 1) {
@@ -27749,12 +40402,12 @@ avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
       }
       for (; k < forward; k++) {
         uint32_t word = buf[k];
-        if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII)
+        if ((word & 0xFFFFFF80) == 0) {
           *utf8_output++ = char(word);
-        } else if ((word & 0xFFFFF800) == 0) { // 2-byte
+        } else if ((word & 0xFFFFF800) == 0) {
           *utf8_output++ = char((word >> 6) | 0b11000000);
           *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else if ((word & 0xFFFF0000) == 0) { // 3-byte
+        } else if ((word & 0xFFFF0000) == 0) {
           if (word >= 0xD800 && word <= 0xDFFF) {
             return std::make_pair(
                 result(error_code::SURROGATE, buf - start + k), utf8_output);
@@ -27762,7 +40415,7 @@ avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
           *utf8_output++ = char((word >> 12) | 0b11100000);
           *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
           *utf8_output++ = char((word & 0b111111) | 0b10000000);
-        } else { // 4-byte
+        } else {
           if (word > 0x10FFFF) {
             return std::make_pair(
                 result(error_code::TOO_LARGE, buf - start + k), utf8_output);
@@ -27776,48 +40429,46 @@ avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
       buf += k;
     }
   } // while
-
   return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
-/* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */
-/* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */
+/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */
+/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */
 template <endianness big_endian>
 std::pair<const char32_t *, char16_t *>
-avx2_convert_utf32_to_utf16(const char32_t *buf, size_t len,
-                            char16_t *utf16_output) {
-  const char32_t *end = buf + len;
-
-  const size_t safety_margin =
-      12; // to avoid overruns, see issue
-          // https://github.com/simdutf/simdutf/issues/92
-  __m256i forbidden_bytemask = _mm256_setzero_si256();
+sse_convert_utf32_to_utf16(const char32_t *buf, size_t len,
+                           char16_t *utf16_output) {
 
-  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
-    __m256i in = _mm256_loadu_si256((__m256i *)buf);
+  const char32_t *end = buf + len;
 
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+  const __m128i v_0000 = _mm_setzero_si128();
+  const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
+  __m128i forbidden_bytemask = _mm_setzero_si128();
 
-    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-    const __m256i saturation_bytemask =
-        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+  while (end - buf >= 8) {
+    __m128i in = _mm_loadu_si128((__m128i *)buf);
+    __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
+    const __m128i saturation_bytemask = _mm_cmpeq_epi32(
+        _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
     const uint32_t saturation_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
 
-    if (saturation_bitmask == 0xffffffff) {
-      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-      forbidden_bytemask = _mm256_or_si256(
+    // Check if no bits set above 16th
+    if (saturation_bitmask == 0xffff) {
+      // Pack UTF-32 to UTF-16
+      __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+
+      const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+      forbidden_bytemask = _mm_or_si128(
           forbidden_bytemask,
-          _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800));
+          _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800));
 
-      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
-                                              _mm256_extractf128_si256(in, 1));
       if (big_endian) {
         const __m128i swap =
             _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
       }
+
       _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
       utf16_output += 8;
       buf += 8;
@@ -27861,7 +40512,7 @@ avx2_convert_utf32_to_utf16(const char32_t *buf, size_t len,
   }
 
   // check for invalid input
-  if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) != 0) {
+  if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
     return std::make_pair(nullptr, utf16_output);
   }
 
@@ -27870,45 +40521,42 @@ avx2_convert_utf32_to_utf16(const char32_t *buf, size_t len,
 
 template <endianness big_endian>
 std::pair<result, char16_t *>
-avx2_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
-                                        char16_t *utf16_output) {
+sse_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
+                                       char16_t *utf16_output) {
   const char32_t *start = buf;
   const char32_t *end = buf + len;
 
-  const size_t safety_margin =
-      12; // to avoid overruns, see issue
-          // https://github.com/simdutf/simdutf/issues/92
-
-  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
-    __m256i in = _mm256_loadu_si256((__m256i *)buf);
-
-    const __m256i v_00000000 = _mm256_setzero_si256();
-    const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000);
+  const __m128i v_0000 = _mm_setzero_si128();
+  const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
 
-    // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs
-    const __m256i saturation_bytemask =
-        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
+  while (end - buf >= 8) {
+    __m128i in = _mm_loadu_si128((__m128i *)buf);
+    __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
+    const __m128i saturation_bytemask = _mm_cmpeq_epi32(
+        _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
     const uint32_t saturation_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(saturation_bytemask));
+        static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
 
-    if (saturation_bitmask == 0xffffffff) {
-      const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800);
-      const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800);
-      const __m256i forbidden_bytemask =
-          _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm256_movemask_epi8(forbidden_bytemask)) !=
-          0x0) {
+    // Check if no bits set above 16th
+    if (saturation_bitmask == 0xffff) {
+      // Pack UTF-32 to UTF-16
+      __m128i utf16_packed = _mm_packus_epi32(in, nextin);
+
+      const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
+      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
+      const __m128i forbidden_bytemask =
+          _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800);
+      if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
         return std::make_pair(result(error_code::SURROGATE, buf - start),
                               utf16_output);
       }
 
-      __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in),
-                                              _mm256_extractf128_si256(in, 1));
       if (big_endian) {
         const __m128i swap =
             _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
         utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
       }
+
       _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
       utf16_output += 8;
       buf += 8;
@@ -27955,72 +40603,8 @@ avx2_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
 
   return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
 }
-/* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */
-
-/* begin file src/haswell/avx2_convert_utf8_to_latin1.cpp */
-// depends on "tables/utf8_to_utf16_tables.h"
-
-// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
-// end of the code points. Only the least significant 12 bits of the mask
-// are accessed.
-// It returns how many bytes were consumed (up to 12).
-size_t convert_masked_utf8_to_latin1(const char *input,
-                                     uint64_t utf8_end_of_code_point_mask,
-                                     char *&latin1_output) {
-  // we use an approach where we try to process up to 12 input bytes.
-  // Why 12 input bytes and not 16? Because we are concerned with the size of
-  // the lookup tables. Also 12 is nicely divisible by two and three.
-  //
-  //
-  // Optimization note: our main path below is load-latency dependent. Thus it
-  // is maybe beneficial to have fast paths that depend on branch prediction but
-  // have less latency. This results in more instructions but, potentially, also
-  // higher speeds.
-  //
-  const __m128i in = _mm_loadu_si128((__m128i *)input);
-
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask &
-      0xfff; // we are only processing 12 bytes in case it is not all ASCII
-
-  if (utf8_end_of_code_point_mask == 0xfff) {
-    // We process the data in chunks of 12 bytes.
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
-    latin1_output += 12; // We wrote 12 characters.
-    return 12;           // We consumed 1 bytes.
-  }
-  /// We do not have a fast path available, so we fallback.
-  const uint8_t idx =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
-  // this indicates an invalid input:
-  if (idx >= 64) {
-    return consumed;
-  }
-  // Here we should have (idx < 64), if not, there is a bug in the validation or
-  // elsewhere. SIX (6) input code-code units this is a relatively easy scenario
-  // we process SIX (6) input code-code units. The max length in bytes of six
-  // code code units spanning between 1 and 2 bytes each is 12 bytes. On
-  // processors where pdep/pext is fast, we might be able to use a small lookup
-  // table.
-  const __m128i sh =
-      _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-  const __m128i perm = _mm_shuffle_epi8(in, sh);
-  const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-  const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-  __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-  const __m128i latin1_packed = _mm_packus_epi16(composed, composed);
-  // writing 8 bytes even though we only care about the first 6 bytes.
-  // performance note: it would be faster to use _mm_storeu_si128, we should
-  // investigate.
-  _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
-  latin1_output += 6; // We wrote 6 bytes.
-  return consumed;
-}
-/* end file src/haswell/avx2_convert_utf8_to_latin1.cpp */
-
-/* begin file src/haswell/avx2_base64.cpp */
+/* end file src/westmere/sse_convert_utf32_to_utf16.cpp */
+/* begin file src/westmere/sse_base64.cpp */
 /**
  * References and further reading:
  *
@@ -28048,151 +40632,155 @@ size_t convert_masked_utf8_to_latin1(const char *input,
  * Nick Kopp. 2013. Base64 Encoding on a GPU.
  * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
  */
-
-template <bool base64_url>
-simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) {
+template <bool base64_url> __m128i lookup_pshufb_improved(const __m128i input) {
   // credit: Wojciech Muła
-  __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51));
-  const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input);
-  result =
-      _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13)));
-  __m256i shift_LUT;
-  if (base64_url) {
-    shift_LUT = _mm256_setr_epi8(
-        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0,
+  // reduce  0..51 -> 0
+  //        52..61 -> 1 .. 10
+  //            62 -> 11
+  //            63 -> 12
+  __m128i result = _mm_subs_epu8(input, _mm_set1_epi8(51));
 
-        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
-  } else {
-    shift_LUT = _mm256_setr_epi8(
-        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0,
+  // distinguish between ranges 0..25 and 26..51:
+  //         0 .. 25 -> becomes 13
+  //        26 .. 51 -> remains 0
+  const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input);
+  result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13)));
 
-        'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-        '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
+  __m128i shift_LUT;
+  if (base64_url) {
+    shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                              '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                              '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
+  } else {
+    shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                              '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
+                              '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
   }
 
-  result = _mm256_shuffle_epi8(shift_LUT, result);
-  return _mm256_add_epi8(result, input);
+  // read shift
+  result = _mm_shuffle_epi8(shift_LUT, result);
+
+  return _mm_add_epi8(result, input);
 }
 
 template <bool isbase64url>
 size_t encode_base64(char *dst, const char *src, size_t srclen,
                      base64_options options) {
   // credit: Wojciech Muła
+  // SSE (lookup: pshufb improved unrolled)
   const uint8_t *input = (const uint8_t *)src;
 
   uint8_t *out = (uint8_t *)dst;
-  const __m256i shuf =
-      _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
+  const __m128i shuf =
+      _mm_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
 
-                      10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
   size_t i = 0;
-  for (; i + 100 <= srclen; i += 96) {
-    const __m128i lo0 = _mm_loadu_si128(
+  for (; i + 52 <= srclen; i += 48) {
+    __m128i in0 = _mm_loadu_si128(
         reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
-    const __m128i hi0 = _mm_loadu_si128(
+    __m128i in1 = _mm_loadu_si128(
         reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
-    const __m128i lo1 = _mm_loadu_si128(
+    __m128i in2 = _mm_loadu_si128(
         reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
-    const __m128i hi1 = _mm_loadu_si128(
+    __m128i in3 = _mm_loadu_si128(
         reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
-    const __m128i lo2 = _mm_loadu_si128(
-        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 4));
-    const __m128i hi2 = _mm_loadu_si128(
-        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 5));
-    const __m128i lo3 = _mm_loadu_si128(
-        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 6));
-    const __m128i hi3 = _mm_loadu_si128(
-        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 7));
 
-    __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf);
-    __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf);
-    __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf);
-    __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf);
+    in0 = _mm_shuffle_epi8(in0, shuf);
+    in1 = _mm_shuffle_epi8(in1, shuf);
+    in2 = _mm_shuffle_epi8(in2, shuf);
+    in3 = _mm_shuffle_epi8(in3, shuf);
 
-    const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00));
-    const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00));
-    const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00));
-    const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00));
+    const __m128i t0_0 = _mm_and_si128(in0, _mm_set1_epi32(0x0fc0fc00));
+    const __m128i t0_1 = _mm_and_si128(in1, _mm_set1_epi32(0x0fc0fc00));
+    const __m128i t0_2 = _mm_and_si128(in2, _mm_set1_epi32(0x0fc0fc00));
+    const __m128i t0_3 = _mm_and_si128(in3, _mm_set1_epi32(0x0fc0fc00));
 
-    const __m256i t1_0 =
-        _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040));
-    const __m256i t1_1 =
-        _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040));
-    const __m256i t1_2 =
-        _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040));
-    const __m256i t1_3 =
-        _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040));
+    const __m128i t1_0 = _mm_mulhi_epu16(t0_0, _mm_set1_epi32(0x04000040));
+    const __m128i t1_1 = _mm_mulhi_epu16(t0_1, _mm_set1_epi32(0x04000040));
+    const __m128i t1_2 = _mm_mulhi_epu16(t0_2, _mm_set1_epi32(0x04000040));
+    const __m128i t1_3 = _mm_mulhi_epu16(t0_3, _mm_set1_epi32(0x04000040));
 
-    const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0));
-    const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0));
-    const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0));
-    const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0));
+    const __m128i t2_0 = _mm_and_si128(in0, _mm_set1_epi32(0x003f03f0));
+    const __m128i t2_1 = _mm_and_si128(in1, _mm_set1_epi32(0x003f03f0));
+    const __m128i t2_2 = _mm_and_si128(in2, _mm_set1_epi32(0x003f03f0));
+    const __m128i t2_3 = _mm_and_si128(in3, _mm_set1_epi32(0x003f03f0));
 
-    const __m256i t3_0 =
-        _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010));
-    const __m256i t3_1 =
-        _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010));
-    const __m256i t3_2 =
-        _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010));
-    const __m256i t3_3 =
-        _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010));
+    const __m128i t3_0 = _mm_mullo_epi16(t2_0, _mm_set1_epi32(0x01000010));
+    const __m128i t3_1 = _mm_mullo_epi16(t2_1, _mm_set1_epi32(0x01000010));
+    const __m128i t3_2 = _mm_mullo_epi16(t2_2, _mm_set1_epi32(0x01000010));
+    const __m128i t3_3 = _mm_mullo_epi16(t2_3, _mm_set1_epi32(0x01000010));
 
-    const __m256i input0 = _mm256_or_si256(t1_0, t3_0);
-    const __m256i input1 = _mm256_or_si256(t1_1, t3_1);
-    const __m256i input2 = _mm256_or_si256(t1_2, t3_2);
-    const __m256i input3 = _mm256_or_si256(t1_3, t3_3);
+    const __m128i input0 = _mm_or_si128(t1_0, t3_0);
+    const __m128i input1 = _mm_or_si128(t1_1, t3_1);
+    const __m128i input2 = _mm_or_si128(t1_2, t3_2);
+    const __m128i input3 = _mm_or_si128(t1_3, t3_3);
 
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved<isbase64url>(input0));
-    out += 32;
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+                     lookup_pshufb_improved<isbase64url>(input0));
+    out += 16;
 
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved<isbase64url>(input1));
-    out += 32;
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+                     lookup_pshufb_improved<isbase64url>(input1));
+    out += 16;
 
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved<isbase64url>(input2));
-    out += 32;
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved<isbase64url>(input3));
-    out += 32;
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+                     lookup_pshufb_improved<isbase64url>(input2));
+    out += 16;
+
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+                     lookup_pshufb_improved<isbase64url>(input3));
+    out += 16;
   }
-  for (; i + 28 <= srclen; i += 24) {
-    // lo = [xxxx|DDDC|CCBB|BAAA]
-    // hi = [xxxx|HHHG|GGFF|FEEE]
-    const __m128i lo =
-        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
-    const __m128i hi =
-        _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i + 4 * 3));
+  for (; i + 16 <= srclen; i += 12) {
+
+    __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
 
     // bytes from groups A, B and C are needed in separate 32-bit lanes
-    // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA]
-    __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf);
+    // in = [DDDD|CCCC|BBBB|AAAA]
+    //
+    //      an input triplet has layout
+    //      [????????|ccdddddd|bbbbcccc|aaaaaabb]
+    //        byte 3   byte 2   byte 1   byte 0    -- byte 3 comes from the next
+    //        triplet
+    //
+    //      shuffling changes the order of bytes: 1, 0, 2, 1
+    //      [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
+    //           ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
+    //                  processed bits
+    in = _mm_shuffle_epi8(in, shuf);
 
-    // this part is well commented in encode.sse.cpp
+    // unpacking
 
-    const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00));
-    const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
-    const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0));
-    const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
-    const __m256i indices = _mm256_or_si256(t1, t3);
+    // t0    = [0000cccc|cc000000|aaaaaa00|00000000]
+    const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00));
+    // t1    = [00000000|00cccccc|00000000|00aaaaaa]
+    //          (c * (1 << 10), a * (1 << 6)) >> 16 (note: an unsigned
+    //          multiplication)
+    const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
 
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(out),
-                        lookup_pshufb_improved<isbase64url>(indices));
-    out += 32;
+    // t2    = [00000000|00dddddd|000000bb|bbbb0000]
+    const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0));
+    // t3    = [00dddddd|00000000|00bbbbbb|00000000]
+    //          (d * (1 << 8), b * (1 << 4))
+    const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
+
+    // res   = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
+    const __m128i indices = _mm_or_si128(t1, t3);
+
+    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
+                     lookup_pshufb_improved<isbase64url>(indices));
+    out += 16;
   }
+
   return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
                                                         srclen - i, options);
 }
-
 static inline void compress(__m128i data, uint16_t mask, char *output) {
   if (mask == 0) {
     _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
     return;
   }
+
   // this particular implementation was inspired by work done by @animetosho
   // we do it in two steps, first 8 bytes and then second 8 bytes
   uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
@@ -28218,198 +40806,209 @@ static inline void compress(__m128i data, uint16_t mask, char *output) {
   __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
       tables::base64::pshufb_combine_table + pop1 * 8));
   __m128i answer = _mm_shuffle_epi8(pruned, compactmask);
-
   _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
 }
 
-static inline void compress(__m256i data, uint32_t mask, char *output) {
-  if (mask == 0) {
-    _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data);
-    return;
-  }
-  compress(_mm256_castsi256_si128(data), uint16_t(mask), output);
-  compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16),
-           output + _mm_popcnt_u32(~mask & 0xFFFF));
-}
-
 struct block64 {
-  __m256i chunks[2];
+  __m128i chunks[4];
 };
 
 template <bool base64_url>
-static inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) {
-  const __m256i ascii_space_tbl =
-      _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa,
-                       0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0,
-                       0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0);
+static inline uint16_t to_base64_mask(__m128i *src, uint32_t *error) {
+  const __m128i ascii_space_tbl =
+      _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0,
+                    0xc, 0xd, 0x0, 0x0);
   // credit: aqrit
-  __m256i delta_asso;
+  __m128i delta_asso;
   if (base64_url) {
-    delta_asso =
-        _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0,
-                         0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
-                         0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
+    delta_asso = _mm_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0,
+                               0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
   } else {
-    delta_asso = _mm256_setr_epi8(
-        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00,
-        0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
-  }
 
-  __m256i delta_values;
+    delta_asso = _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                               0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+  }
+  __m128i delta_values;
   if (base64_url) {
-    delta_values = _mm256_setr_epi8(
-        0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9),
-        uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0),
-        uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
-        uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3),
-        uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
+    delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
+                                 uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9),
+                                 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF),
+                                 uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
   } else {
-    delta_values = _mm256_setr_epi8(
-        int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04),
-        int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00),
-        int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
-        int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
-        int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9),
-        int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF),
-        int8_t(0xB9), int8_t(0xB9));
-  }
-  __m256i check_asso;
 
+    delta_values =
+        _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+                      int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+                      int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
+                      int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9));
+  }
+  __m128i check_asso;
   if (base64_url) {
-    check_asso =
-        _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3,
-                         0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1,
-                         0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6);
+    check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                               0x3, 0x7, 0xB, 0xE, 0xB, 0x6);
   } else {
 
-    check_asso = _mm256_setr_epi8(
-        0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
-        0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-        0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+    check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                               0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
   }
-  __m256i check_values;
+  __m128i check_values;
   if (base64_url) {
-    check_values = _mm256_setr_epi8(
-        uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
-        uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6),
-        uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80),
-        0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
-        uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6),
-        uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
-        uint8_t(0x80), 0x0, uint8_t(0x80));
+    check_values = _mm_setr_epi8(uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
+                                 uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF),
+                                 uint8_t(0xB6), uint8_t(0xA6), uint8_t(0xB5),
+                                 uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
+                                 uint8_t(0x80), 0x0, uint8_t(0x80));
   } else {
-    check_values = _mm256_setr_epi8(
-        int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF),
-        int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86),
-        int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91),
-        int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
-        int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5),
-        int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80),
-        int8_t(0x91), int8_t(0x80));
+
+    check_values =
+        _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+                      int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
+                      int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
+                      int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80));
   }
-  const __m256i shifted = _mm256_srli_epi32(*src, 3);
-  const __m256i delta_hash =
-      _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted);
-  const __m256i check_hash =
-      _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted);
-  const __m256i out =
-      _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src);
-  const __m256i chk =
-      _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src);
-  const int mask = _mm256_movemask_epi8(chk);
+  const __m128i shifted = _mm_srli_epi32(*src, 3);
+
+  const __m128i delta_hash =
+      _mm_avg_epu8(_mm_shuffle_epi8(delta_asso, *src), shifted);
+  const __m128i check_hash =
+      _mm_avg_epu8(_mm_shuffle_epi8(check_asso, *src), shifted);
+
+  const __m128i out =
+      _mm_adds_epi8(_mm_shuffle_epi8(delta_values, delta_hash), *src);
+  const __m128i chk =
+      _mm_adds_epi8(_mm_shuffle_epi8(check_values, check_hash), *src);
+  const int mask = _mm_movemask_epi8(chk);
   if (mask) {
-    __m256i ascii_space =
-        _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src);
-    *error = (mask ^ _mm256_movemask_epi8(ascii_space));
+    __m128i ascii_space =
+        _mm_cmpeq_epi8(_mm_shuffle_epi8(ascii_space_tbl, *src), *src);
+    *error = (mask ^ _mm_movemask_epi8(ascii_space));
   }
   *src = out;
-  return (uint32_t)mask;
+  return (uint16_t)mask;
 }
 
 template <bool base64_url>
 static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) {
   uint32_t err0 = 0;
   uint32_t err1 = 0;
+  uint32_t err2 = 0;
+  uint32_t err3 = 0;
   uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], &err0);
   uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], &err1);
-  *error = err0 | ((uint64_t)err1 << 32);
-  return m0 | (m1 << 32);
+  uint64_t m2 = to_base64_mask<base64_url>(&b->chunks[2], &err2);
+  uint64_t m3 = to_base64_mask<base64_url>(&b->chunks[3], &err3);
+  *error = (err0) | ((uint64_t)err1 << 16) | ((uint64_t)err2 << 32) |
+           ((uint64_t)err3 << 48);
+  return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
+}
+
+#if defined(_MSC_VER) && !defined(__clang__)
+static inline size_t simdutf_tzcnt_u64(uint64_t num) {
+  unsigned long ret;
+  if (num == 0) {
+    return 64;
+  }
+  _BitScanForward64(&ret, num);
+  return ret;
+}
+#else // GCC or Clang
+static inline size_t simdutf_tzcnt_u64(uint64_t num) {
+  return num ? __builtin_ctzll(num) : 64;
 }
+#endif
 
 static inline void copy_block(block64 *b, char *output) {
-  _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), b->chunks[0]);
-  _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), b->chunks[1]);
+  _mm_storeu_si128(reinterpret_cast<__m128i *>(output), b->chunks[0]);
+  _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 16), b->chunks[1]);
+  _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 32), b->chunks[2]);
+  _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 48), b->chunks[3]);
 }
 
 static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
   uint64_t nmask = ~mask;
-  compress(b->chunks[0], uint32_t(mask), output);
-  compress(b->chunks[1], uint32_t(mask >> 32),
+  compress(b->chunks[0], uint16_t(mask), output);
+  compress(b->chunks[1], uint16_t(mask >> 16),
+           output + _mm_popcnt_u64(nmask & 0xFFFF));
+  compress(b->chunks[2], uint16_t(mask >> 32),
            output + _mm_popcnt_u64(nmask & 0xFFFFFFFF));
+  compress(b->chunks[3], uint16_t(mask >> 48),
+           output + _mm_popcnt_u64(nmask & 0xFFFFFFFFFFFFULL));
   return _mm_popcnt_u64(nmask);
 }
 
 // The caller of this function is responsible to ensure that there are 64 bytes
 // available from reading at src. The data is read into a block64 structure.
 static inline void load_block(block64 *b, const char *src) {
-  b->chunks[0] = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
-  b->chunks[1] =
-      _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
+  b->chunks[0] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+  b->chunks[1] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
+  b->chunks[2] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32));
+  b->chunks[3] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
 }
 
 // The caller of this function is responsible to ensure that there are 128 bytes
 // available from reading at src. The data is read into a block64 structure.
 static inline void load_block(block64 *b, const char16_t *src) {
-  __m256i m1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
-  __m256i m2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 16));
-  __m256i m3 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32));
-  __m256i m4 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 48));
-  __m256i m1p = _mm256_permute2x128_si256(m1, m2, 0x20);
-  __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x31);
-  __m256i m3p = _mm256_permute2x128_si256(m3, m4, 0x20);
-  __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x31);
-  b->chunks[0] = _mm256_packus_epi16(m1p, m2p);
-  b->chunks[1] = _mm256_packus_epi16(m3p, m4p);
+  __m128i m1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+  __m128i m2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 8));
+  __m128i m3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
+  __m128i m4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 24));
+  __m128i m5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32));
+  __m128i m6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 40));
+  __m128i m7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
+  __m128i m8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 56));
+  b->chunks[0] = _mm_packus_epi16(m1, m2);
+  b->chunks[1] = _mm_packus_epi16(m3, m4);
+  b->chunks[2] = _mm_packus_epi16(m5, m6);
+  b->chunks[3] = _mm_packus_epi16(m7, m8);
 }
 
-static inline void base64_decode(char *out, __m256i str) {
+static inline void base64_decode(char *out, __m128i str) {
   // credit: aqrit
-  const __m256i pack_shuffle =
-      _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
-                       2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
-  const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140));
-  const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000));
-  const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle);
 
+  const __m128i pack_shuffle =
+      _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
+
+  const __m128i t0 = _mm_maddubs_epi16(str, _mm_set1_epi32(0x01400140));
+  const __m128i t1 = _mm_madd_epi16(t0, _mm_set1_epi32(0x00011000));
+  const __m128i t2 = _mm_shuffle_epi8(t1, pack_shuffle);
   // Store the output:
-  _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2));
-  _mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1));
+  // this writes 16 bytes, but we only need 12.
+  _mm_storeu_si128((__m128i *)out, t2);
 }
 // decode 64 bytes and output 48 bytes
 static inline void base64_decode_block(char *out, const char *src) {
-  base64_decode(out,
-                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
-  base64_decode(out + 24, _mm256_loadu_si256(
-                              reinterpret_cast<const __m256i *>(src + 32)));
+  base64_decode(out, _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
+  base64_decode(out + 12,
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16)));
+  base64_decode(out + 24,
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32)));
+  base64_decode(out + 36,
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48)));
 }
 static inline void base64_decode_block_safe(char *out, const char *src) {
-  base64_decode(out,
-                _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src)));
-  char buffer[32]; // We enforce safety with a buffer.
-  base64_decode(
-      buffer, _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + 32)));
-  std::memcpy(out + 24, buffer, 24);
+  base64_decode(out, _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
+  base64_decode(out + 12,
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16)));
+  base64_decode(out + 24,
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32)));
+  char buffer[16];
+  base64_decode(buffer,
+                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48)));
+  std::memcpy(out + 36, buffer, 12);
 }
 static inline void base64_decode_block(char *out, block64 *b) {
   base64_decode(out, b->chunks[0]);
-  base64_decode(out + 24, b->chunks[1]);
+  base64_decode(out + 12, b->chunks[1]);
+  base64_decode(out + 24, b->chunks[2]);
+  base64_decode(out + 36, b->chunks[3]);
 }
 static inline void base64_decode_block_safe(char *out, block64 *b) {
   base64_decode(out, b->chunks[0]);
-  char buffer[32]; // We enforce safety with a buffer.
-  base64_decode(buffer, b->chunks[1]);
-  std::memcpy(out + 24, buffer, 24);
+  base64_decode(out + 12, b->chunks[1]);
+  base64_decode(out + 24, b->chunks[2]);
+  char buffer[16];
+  base64_decode(buffer, b->chunks[3]);
+  std::memcpy(out + 36, buffer, 12);
 }
 
 template <bool base64_url, typename chartype>
@@ -28456,7 +41055,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen,
   const chartype *const srcend = src + srclen;
 
   constexpr size_t block_size = 6;
-  static_assert(block_size >= 2, "block_size must be at least two");
+  static_assert(block_size >= 2, "block should of size 2 or more");
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
@@ -28469,7 +41068,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen,
       uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        size_t error_offset = _tzcnt_u64(error);
+        size_t error_offset = simdutf_tzcnt_u64(error);
         return {error_code::INVALID_BASE64_CHARACTER,
                 size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
       }
@@ -28512,7 +41111,6 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen,
   // time, otherwise, we should just decode directly.
   int last_block = (int)((bufferptr - buffer_start) % 64);
   if (last_block != 0 && srcend - src + last_block >= 64) {
-
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
       uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
@@ -28598,15 +41196,15 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen,
   }
   return {SUCCESS, srclen, size_t(dst - dstinit)};
 }
-/* end file src/haswell/avx2_base64.cpp */
+/* end file src/westmere/sse_base64.cpp */
 
 } // unnamed namespace
-} // namespace haswell
+} // namespace westmere
 } // namespace simdutf
 
 /* begin file src/generic/buf_block_reader.h */
 namespace simdutf {
-namespace haswell {
+namespace westmere {
 namespace {
 
 // Walks through a buffer in block-sized increments, loading the last part with
@@ -28712,12 +41310,12 @@ simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
 }
 
 } // unnamed namespace
-} // namespace haswell
+} // namespace westmere
 } // namespace simdutf
 /* end file src/generic/buf_block_reader.h */
 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
 namespace simdutf {
-namespace haswell {
+namespace westmere {
 namespace {
 namespace utf8_validation {
 
@@ -28937,12 +41535,12 @@ struct utf8_checker {
 using utf8_validation::utf8_checker;
 
 } // unnamed namespace
-} // namespace haswell
+} // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
 /* begin file src/generic/utf8_validation/utf8_validator.h */
 namespace simdutf {
-namespace haswell {
+namespace westmere {
 namespace {
 namespace utf8_validation {
 
@@ -29077,14 +41675,14 @@ result generic_validate_ascii_with_errors(const char *input, size_t length) {
 
 } // namespace utf8_validation
 } // unnamed namespace
-} // namespace haswell
+} // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_validator.h */
 // transcoding from UTF-8 to UTF-16
 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
 namespace simdutf {
-namespace haswell {
+namespace westmere {
 namespace {
 namespace utf8_to_utf16 {
 
@@ -29155,13 +41753,13 @@ simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
 
 } // namespace utf8_to_utf16
 } // unnamed namespace
-} // namespace haswell
+} // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
 namespace simdutf {
-namespace haswell {
+namespace westmere {
 namespace {
 namespace utf8_to_utf16 {
 using namespace simd;
@@ -29490,14 +42088,14 @@ struct validating_transcoder {
 }; // struct utf8_checker
 } // namespace utf8_to_utf16
 } // unnamed namespace
-} // namespace haswell
+} // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 // transcoding from UTF-8 to UTF-32
 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 
 namespace simdutf {
-namespace haswell {
+namespace westmere {
 namespace {
 namespace utf8_to_utf32 {
 
@@ -29536,13 +42134,13 @@ simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
 
 } // namespace utf8_to_utf32
 } // unnamed namespace
-} // namespace haswell
+} // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
 namespace simdutf {
-namespace haswell {
+namespace westmere {
 namespace {
 namespace utf8_to_utf32 {
 using namespace simd;
@@ -29857,14 +42455,14 @@ struct validating_transcoder {
 }; // struct utf8_checker
 } // namespace utf8_to_utf32
 } // unnamed namespace
-} // namespace haswell
+} // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 // other functions
 /* begin file src/generic/utf8.h */
 
 namespace simdutf {
-namespace haswell {
+namespace westmere {
 namespace {
 namespace utf8 {
 
@@ -29899,12 +42497,12 @@ simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
 }
 } // namespace utf8
 } // unnamed namespace
-} // namespace haswell
+} // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8.h */
 /* begin file src/generic/utf16.h */
 namespace simdutf {
-namespace haswell {
+namespace westmere {
 namespace {
 namespace utf16 {
 
@@ -29974,15 +42572,14 @@ change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
 
 } // namespace utf16
 } // unnamed namespace
-} // namespace haswell
+} // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf16.h */
-
 // transcoding from UTF-8 to Latin 1
 /* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
 
 namespace simdutf {
-namespace haswell {
+namespace westmere {
 namespace {
 namespace utf8_to_latin1 {
 using namespace simd;
@@ -30292,13 +42889,13 @@ struct validating_transcoder {
 }; // struct utf8_checker
 } // namespace utf8_to_latin1
 } // unnamed namespace
-} // namespace haswell
+} // namespace westmere
 } // namespace simdutf
 /* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
 /* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
 
 namespace simdutf {
-namespace haswell {
+namespace westmere {
 namespace {
 namespace utf8_to_latin1 {
 using namespace simd;
@@ -30372,19 +42969,24 @@ simdutf_really_inline size_t convert_valid(const char *in, size_t size,
 
 } // namespace utf8_to_latin1
 } // namespace
-} // namespace haswell
+} // namespace westmere
 } // namespace simdutf
   // namespace simdutf
 /* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
 
+//
+// Implementation-specific overrides
+//
+
 namespace simdutf {
-namespace haswell {
+namespace westmere {
 
 simdutf_warn_unused int
 implementation::detect_encodings(const char *input,
                                  size_t length) const noexcept {
   // If there is a BOM, then we trust it.
   auto bom_encoding = simdutf::BOM::check_bom(input, length);
+  // todo: reimplement as a one-pass algorithm.
   if (bom_encoding != encoding_type::unspecified) {
     return bom_encoding;
   }
@@ -30408,22 +43010,23 @@ implementation::detect_encodings(const char *input,
 
 simdutf_warn_unused bool
 implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  return haswell::utf8_validation::generic_validate_utf8(buf, len);
+  return westmere::utf8_validation::generic_validate_utf8(buf, len);
 }
 
 simdutf_warn_unused result implementation::validate_utf8_with_errors(
     const char *buf, size_t len) const noexcept {
-  return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+  return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len);
 }
 
 simdutf_warn_unused bool
 implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return haswell::utf8_validation::generic_validate_ascii(buf, len);
+  return westmere::utf8_validation::generic_validate_ascii(buf, len);
 }
 
 simdutf_warn_unused result implementation::validate_ascii_with_errors(
     const char *buf, size_t len) const noexcept {
-  return haswell::utf8_validation::generic_validate_ascii_with_errors(buf, len);
+  return westmere::utf8_validation::generic_validate_ascii_with_errors(buf,
+                                                                       len);
 }
 
 simdutf_warn_unused bool
@@ -30434,7 +43037,7 @@ implementation::validate_utf16le(const char16_t *buf,
     // handling nullptr
     return true;
   }
-  const char16_t *tail = avx2_validate_utf16<endianness::LITTLE>(buf, len);
+  const char16_t *tail = sse_validate_utf16<endianness::LITTLE>(buf, len);
   if (tail) {
     return scalar::utf16::validate<endianness::LITTLE>(tail,
                                                        len - (tail - buf));
@@ -30451,7 +43054,7 @@ implementation::validate_utf16be(const char16_t *buf,
     // handling nullptr
     return true;
   }
-  const char16_t *tail = avx2_validate_utf16<endianness::BIG>(buf, len);
+  const char16_t *tail = sse_validate_utf16<endianness::BIG>(buf, len);
   if (tail) {
     return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
   } else {
@@ -30461,7 +43064,7 @@ implementation::validate_utf16be(const char16_t *buf,
 
 simdutf_warn_unused result implementation::validate_utf16le_with_errors(
     const char16_t *buf, size_t len) const noexcept {
-  result res = avx2_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+  result res = sse_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
   if (res.count != len) {
     result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(
         buf + res.count, len - res.count);
@@ -30473,7 +43076,7 @@ simdutf_warn_unused result implementation::validate_utf16le_with_errors(
 
 simdutf_warn_unused result implementation::validate_utf16be_with_errors(
     const char16_t *buf, size_t len) const noexcept {
-  result res = avx2_validate_utf16_with_errors<endianness::BIG>(buf, len);
+  result res = sse_validate_utf16_with_errors<endianness::BIG>(buf, len);
   if (res.count != len) {
     result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(
         buf + res.count, len - res.count);
@@ -30490,7 +43093,7 @@ implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
     // handling nullptr
     return true;
   }
-  const char32_t *tail = avx2_validate_utf32le(buf, len);
+  const char32_t *tail = sse_validate_utf32le(buf, len);
   if (tail) {
     return scalar::utf32::validate(tail, len - (tail - buf));
   } else {
@@ -30500,12 +43103,12 @@ implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
 
 simdutf_warn_unused result implementation::validate_utf32_with_errors(
     const char32_t *buf, size_t len) const noexcept {
-  if (simdutf_unlikely(len == 0)) {
+  if (len == 0) {
     // empty input is valid UTF-32. protect the implementation from
     // handling nullptr
     return result(error_code::SUCCESS, 0);
   }
-  result res = avx2_validate_utf32le_with_errors(buf, len);
+  result res = sse_validate_utf32le_with_errors(buf, len);
   if (res.count != len) {
     result scalar_res =
         scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
@@ -30517,8 +43120,9 @@ simdutf_warn_unused result implementation::validate_utf32_with_errors(
 
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
     const char *buf, size_t len, char *utf8_output) const noexcept {
+
   std::pair<const char *, char *> ret =
-      avx2_convert_latin1_to_utf8(buf, len, utf8_output);
+      sse_convert_latin1_to_utf8(buf, len, utf8_output);
   size_t converted_chars = ret.second - utf8_output;
 
   if (ret.first != buf + len) {
@@ -30533,7 +43137,7 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
     const char *buf, size_t len, char16_t *utf16_output) const noexcept {
   std::pair<const char *, char16_t *> ret =
-      avx2_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+      sse_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -30553,7 +43157,7 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
     const char *buf, size_t len, char16_t *utf16_output) const noexcept {
   std::pair<const char *, char16_t *> ret =
-      avx2_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
+      sse_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -30573,7 +43177,7 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
     const char *buf, size_t len, char32_t *utf32_output) const noexcept {
   std::pair<const char *, char32_t *> ret =
-      avx2_convert_latin1_to_utf32(buf, len, utf32_output);
+      sse_convert_latin1_to_utf32(buf, len, utf32_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -30602,8 +43206,8 @@ simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
-    const char *input, size_t size, char *latin1_output) const noexcept {
-  return utf8_to_latin1::convert_valid(input, size, latin1_output);
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  return westmere::utf8_to_latin1::convert_valid(buf, len, latin1_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
@@ -30663,12 +43267,12 @@ simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
 simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
   std::pair<const char16_t *, char *> ret =
-      haswell::avx2_convert_utf16_to_latin1<endianness::LITTLE>(buf, len,
-                                                                latin1_output);
+      sse_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
   if (ret.first == nullptr) {
     return 0;
   }
   size_t saved_bytes = ret.second - latin1_output;
+
   if (ret.first != buf + len) {
     const size_t scalar_saved_bytes =
         scalar::utf16_to_latin1::convert<endianness::LITTLE>(
@@ -30684,12 +43288,12 @@ simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
 simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
   std::pair<const char16_t *, char *> ret =
-      haswell::avx2_convert_utf16_to_latin1<endianness::BIG>(buf, len,
-                                                             latin1_output);
+      sse_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
   if (ret.first == nullptr) {
     return 0;
   }
   size_t saved_bytes = ret.second - latin1_output;
+
   if (ret.first != buf + len) {
     const size_t scalar_saved_bytes =
         scalar::utf16_to_latin1::convert<endianness::BIG>(
@@ -30706,7 +43310,7 @@ simdutf_warn_unused result
 implementation::convert_utf16le_to_latin1_with_errors(
     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
   std::pair<result, char *> ret =
-      avx2_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
+      sse_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
           buf, len, latin1_output);
   if (ret.first.error) {
     return ret.first;
@@ -30733,8 +43337,8 @@ simdutf_warn_unused result
 implementation::convert_utf16be_to_latin1_with_errors(
     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
   std::pair<result, char *> ret =
-      avx2_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
-                                                                latin1_output);
+      sse_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
+                                                               latin1_output);
   if (ret.first.error) {
     return ret.first;
   } // Can return directly since scalar fallback already found correct
@@ -30758,21 +43362,20 @@ implementation::convert_utf16be_to_latin1_with_errors(
 
 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  // optimization opportunity: implement a custom function
+  // optimization opportunity: we could provide an optimized function.
   return convert_utf16be_to_latin1(buf, len, latin1_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  // optimization opportunity: implement a custom function
+  // optimization opportunity: we could provide an optimized function.
   return convert_utf16le_to_latin1(buf, len, latin1_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
     const char16_t *buf, size_t len, char *utf8_output) const noexcept {
   std::pair<const char16_t *, char *> ret =
-      haswell::avx2_convert_utf16_to_utf8<endianness::LITTLE>(buf, len,
-                                                              utf8_output);
+      sse_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -30792,8 +43395,7 @@ simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
     const char16_t *buf, size_t len, char *utf8_output) const noexcept {
   std::pair<const char16_t *, char *> ret =
-      haswell::avx2_convert_utf16_to_utf8<endianness::BIG>(buf, len,
-                                                           utf8_output);
+      sse_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -30815,7 +43417,7 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char *> ret =
-      haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(
+      westmere::sse_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(
           buf, len, utf8_output);
   if (ret.first.error) {
     return ret.first;
@@ -30843,7 +43445,7 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char *> ret =
-      haswell::avx2_convert_utf16_to_utf8_with_errors<endianness::BIG>(
+      westmere::sse_convert_utf16_to_utf8_with_errors<endianness::BIG>(
           buf, len, utf8_output);
   if (ret.first.error) {
     return ret.first;
@@ -30876,34 +43478,16 @@ simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
   return convert_utf16be_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  std::pair<const char32_t *, char *> ret =
-      avx2_convert_utf32_to_utf8(buf, len, utf8_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - utf8_output;
-  if (ret.first != buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
-        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
 simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
     const char32_t *buf, size_t len, char *latin1_output) const noexcept {
   std::pair<const char32_t *, char *> ret =
-      avx2_convert_utf32_to_latin1(buf, len, latin1_output);
+      sse_convert_utf32_to_latin1(buf, len, latin1_output);
   if (ret.first == nullptr) {
     return 0;
   }
   size_t saved_bytes = ret.second - latin1_output;
-  if (ret.first != buf + len) {
+  // if (ret.first != buf + len) {
+  if (ret.first < buf + len) {
     const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
         ret.first, len - (ret.first - buf), ret.second);
     if (scalar_saved_bytes == 0) {
@@ -30919,7 +43503,8 @@ simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char *> ret =
-      avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
+      westmere::sse_convert_utf32_to_latin1_with_errors(buf, len,
+                                                        latin1_output);
   if (ret.first.count != len) {
     result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
         buf + ret.first.count, len - ret.first.count, ret.second);
@@ -30938,15 +43523,35 @@ simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
 
 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
     const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  // optimization opportunity: we could provide an optimized function.
   return convert_utf32_to_latin1(buf, len, latin1_output);
 }
 
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char32_t *, char *> ret =
+      sse_convert_utf32_to_utf8(buf, len, utf8_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
+
 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
     const char32_t *buf, size_t len, char *utf8_output) const noexcept {
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char *> ret =
-      haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+      westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
   if (ret.first.count != len) {
     result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
         buf + ret.first.count, len - ret.first.count, ret.second);
@@ -30966,8 +43571,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
     const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
   std::pair<const char16_t *, char32_t *> ret =
-      haswell::avx2_convert_utf16_to_utf32<endianness::LITTLE>(buf, len,
-                                                               utf32_output);
+      sse_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -30987,8 +43591,7 @@ simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
     const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
   std::pair<const char16_t *, char32_t *> ret =
-      haswell::avx2_convert_utf16_to_utf32<endianness::BIG>(buf, len,
-                                                            utf32_output);
+      sse_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -31010,7 +43613,7 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char32_t *> ret =
-      haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(
+      westmere::sse_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(
           buf, len, utf32_output);
   if (ret.first.error) {
     return ret.first;
@@ -31038,7 +43641,7 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char32_t *> ret =
-      haswell::avx2_convert_utf16_to_utf32_with_errors<endianness::BIG>(
+      westmere::sse_convert_utf16_to_utf32_with_errors<endianness::BIG>(
           buf, len, utf32_output);
   if (ret.first.error) {
     return ret.first;
@@ -31069,7 +43672,7 @@ simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
     const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
   std::pair<const char32_t *, char16_t *> ret =
-      avx2_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+      sse_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -31089,7 +43692,7 @@ simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
     const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
   std::pair<const char32_t *, char16_t *> ret =
-      avx2_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+      sse_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -31111,7 +43714,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char16_t *> ret =
-      haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
+      westmere::sse_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
           buf, len, utf16_output);
   if (ret.first.count != len) {
     result scalar_res =
@@ -31135,7 +43738,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char16_t *> ret =
-      haswell::avx2_convert_utf32_to_utf16_with_errors<endianness::BIG>(
+      westmere::sse_convert_utf32_to_utf16_with_errors<endianness::BIG>(
           buf, len, utf16_output);
   if (ret.first.count != len) {
     result scalar_res =
@@ -31220,26 +43823,11 @@ simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
   return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
-    const char16_t *input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
-}
-
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
-    const char16_t *input, size_t length) const noexcept {
-  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
-}
-
 simdutf_warn_unused size_t
 implementation::utf16_length_from_latin1(size_t length) const noexcept {
   return scalar::latin1::utf16_length_from_latin1(length);
 }
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
-    const char *input, size_t length) const noexcept {
-  return utf8::utf16_length_from_utf8(input, length);
-}
-
 simdutf_warn_unused size_t
 implementation::utf32_length_from_latin1(size_t length) const noexcept {
   return scalar::latin1::utf32_length_from_latin1(length);
@@ -31247,91 +43835,110 @@ implementation::utf32_length_from_latin1(size_t length) const noexcept {
 
 simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
     const char *input, size_t len) const noexcept {
-  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
-  size_t answer = len / sizeof(__m256i) * sizeof(__m256i);
+  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
+  size_t answer = len / sizeof(__m128i) * sizeof(__m128i);
   size_t i = 0;
   if (answer >= 2048) { // long strings optimization
-    __m256i four_64bits = _mm256_setzero_si256();
-    while (i + sizeof(__m256i) <= len) {
-      __m256i runner = _mm256_setzero_si256();
-      // We can do up to 255 loops without overflow.
-      size_t iterations = (len - i) / sizeof(__m256i);
+    __m128i two_64bits = _mm_setzero_si128();
+    while (i + sizeof(__m128i) <= len) {
+      __m128i runner = _mm_setzero_si128();
+      size_t iterations = (len - i) / sizeof(__m128i);
       if (iterations > 255) {
         iterations = 255;
       }
-      size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i);
-      for (; i + 4 * sizeof(__m256i) <= max_i; i += 4 * sizeof(__m256i)) {
-        __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i));
-        __m256i input2 =
-            _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i)));
-        __m256i input3 = _mm256_loadu_si256(
-            (const __m256i *)(data + i + 2 * sizeof(__m256i)));
-        __m256i input4 = _mm256_loadu_si256(
-            (const __m256i *)(data + i + 3 * sizeof(__m256i)));
-        __m256i input12 =
-            _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1),
-                            _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2));
-        __m256i input23 =
-            _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3),
-                            _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4));
-        __m256i input1234 = _mm256_add_epi8(input12, input23);
-        runner = _mm256_sub_epi8(runner, input1234);
+      size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i);
+      for (; i + 4 * sizeof(__m128i) <= max_i; i += 4 * sizeof(__m128i)) {
+        __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i));
+        __m128i input2 =
+            _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i)));
+        __m128i input3 =
+            _mm_loadu_si128((const __m128i *)(str + i + 2 * sizeof(__m128i)));
+        __m128i input4 =
+            _mm_loadu_si128((const __m128i *)(str + i + 3 * sizeof(__m128i)));
+        __m128i input12 =
+            _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input1),
+                         _mm_cmpgt_epi8(_mm_setzero_si128(), input2));
+        __m128i input34 =
+            _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input3),
+                         _mm_cmpgt_epi8(_mm_setzero_si128(), input4));
+        __m128i input1234 = _mm_add_epi8(input12, input34);
+        runner = _mm_sub_epi8(runner, input1234);
       }
-      for (; i <= max_i; i += sizeof(__m256i)) {
-        __m256i input_256_chunk =
-            _mm256_loadu_si256((const __m256i *)(data + i));
-        runner = _mm256_sub_epi8(
-            runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk));
+      for (; i <= max_i; i += sizeof(__m128i)) {
+        __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i));
+        runner = _mm_sub_epi8(runner,
+                              _mm_cmpgt_epi8(_mm_setzero_si128(), more_input));
       }
-      four_64bits = _mm256_add_epi64(
-          four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256()));
+      two_64bits =
+          _mm_add_epi64(two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128()));
     }
-    answer += _mm256_extract_epi64(four_64bits, 0) +
-              _mm256_extract_epi64(four_64bits, 1) +
-              _mm256_extract_epi64(four_64bits, 2) +
-              _mm256_extract_epi64(four_64bits, 3);
-  } else if (answer > 0) {
-    for (; i + sizeof(__m256i) <= len; i += sizeof(__m256i)) {
-      __m256i latin = _mm256_loadu_si256((const __m256i *)(data + i));
-      uint32_t non_ascii = _mm256_movemask_epi8(latin);
+    answer +=
+        _mm_extract_epi64(two_64bits, 0) + _mm_extract_epi64(two_64bits, 1);
+  } else if (answer > 0) { // short string optimization
+    for (; i + 2 * sizeof(__m128i) <= len; i += 2 * sizeof(__m128i)) {
+      __m128i latin = _mm_loadu_si128((const __m128i *)(input + i));
+      uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin);
+      answer += count_ones(non_ascii);
+      latin = _mm_loadu_si128((const __m128i *)(input + i) + 1);
+      non_ascii = (uint16_t)_mm_movemask_epi8(latin);
+      answer += count_ones(non_ascii);
+    }
+    for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i)) {
+      __m128i latin = _mm_loadu_si128((const __m128i *)(input + i));
+      uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin);
       answer += count_ones(non_ascii);
     }
   }
   return answer + scalar::latin1::utf8_length_from_latin1(
-                      reinterpret_cast<const char *>(data + i), len - i);
+                      reinterpret_cast<const char *>(str + i), len - i);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  return utf8::utf16_length_from_utf8(input, length);
 }
 
 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
     const char32_t *input, size_t length) const noexcept {
-  const __m256i v_00000000 = _mm256_setzero_si256();
-  const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80);
-  const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800);
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  const __m128i v_00000000 = _mm_setzero_si128();
+  const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80);
+  const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800);
+  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
   size_t pos = 0;
   size_t count = 0;
-  for (; pos + 8 <= length; pos += 8) {
-    __m256i in = _mm256_loadu_si256((__m256i *)(input + pos));
-    const __m256i ascii_bytes_bytemask =
-        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000);
-    const __m256i one_two_bytes_bytemask =
-        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000);
-    const __m256i two_bytes_bytemask =
-        _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask);
-    const __m256i one_two_three_bytes_bytemask =
-        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const __m256i three_bytes_bytemask =
-        _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
-    const uint32_t ascii_bytes_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(ascii_bytes_bytemask));
-    const uint32_t two_bytes_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(two_bytes_bytemask));
-    const uint32_t three_bytes_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(three_bytes_bytemask));
+  for (; pos + 4 <= length; pos += 4) {
+    __m128i in = _mm_loadu_si128((__m128i *)(input + pos));
+    const __m128i ascii_bytes_bytemask =
+        _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000);
+    const __m128i one_two_bytes_bytemask =
+        _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000);
+    const __m128i two_bytes_bytemask =
+        _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask);
+    const __m128i one_two_three_bytes_bytemask =
+        _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
+    const __m128i three_bytes_bytemask =
+        _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
+    const uint16_t ascii_bytes_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(ascii_bytes_bytemask));
+    const uint16_t two_bytes_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(two_bytes_bytemask));
+    const uint16_t three_bytes_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(three_bytes_bytemask));
 
     size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
     size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
     size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
-    count += 32 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
+    count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
   }
   return count +
          scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
@@ -31339,18 +43946,18 @@ simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
 
 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
     const char32_t *input, size_t length) const noexcept {
-  const __m256i v_00000000 = _mm256_setzero_si256();
-  const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000);
+  const __m128i v_00000000 = _mm_setzero_si128();
+  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
   size_t pos = 0;
   size_t count = 0;
-  for (; pos + 8 <= length; pos += 8) {
-    __m256i in = _mm256_loadu_si256((__m256i *)(input + pos));
-    const __m256i surrogate_bytemask =
-        _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000);
-    const uint32_t surrogate_bitmask =
-        static_cast<uint32_t>(_mm256_movemask_epi8(surrogate_bytemask));
-    size_t surrogate_count = (32 - count_ones(surrogate_bitmask)) / 4;
-    count += 8 + surrogate_count;
+  for (; pos + 4 <= length; pos += 4) {
+    __m128i in = _mm_loadu_si128((__m128i *)(input + pos));
+    const __m128i surrogate_bytemask =
+        _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
+    const uint16_t surrogate_bitmask =
+        static_cast<uint16_t>(_mm_movemask_epi8(surrogate_bytemask));
+    size_t surrogate_count = (16 - count_ones(surrogate_bitmask)) / 4;
+    count += 4 + surrogate_count;
   }
   return count +
          scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
@@ -31386,3612 +43993,5867 @@ simdutf_warn_unused full_result implementation::base64_to_binary_details(
                                              last_chunk_options);
 }
 
-simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::base64::maximal_binary_length_from_base64(input, length);
+simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
+    const char16_t *input, size_t length) const noexcept {
+  return scalar::base64::maximal_binary_length_from_base64(input, length); // no SIMD path: forwards to the scalar helper
+}
+
+simdutf_warn_unused result implementation::base64_to_binary(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  return (options & base64_url) // base64_url bit set: use the <true> instantiation
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options, // bit clear: <false>
+                                             last_chunk_options);
+}
+
+simdutf_warn_unused full_result implementation::base64_to_binary_details(
+    const char16_t *input, size_t length, char *output, base64_options options,
+    last_chunk_handling_options last_chunk_options) const noexcept {
+  return (options & base64_url) // same dispatch as base64_to_binary; only the return type differs
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options,
+                                             last_chunk_options);
+}
+
+simdutf_warn_unused size_t implementation::base64_length_from_binary(
+    size_t length, base64_options options) const noexcept {
+  return scalar::base64::base64_length_from_binary(length, options); // pure arithmetic: forwards to the scalar helper
+}
+
+size_t implementation::binary_to_base64(const char *input, size_t length,
+                                        char *output,
+                                        base64_options options) const noexcept {
+  if (options & base64_url) { // base64_url bit set: <true> instantiation of encode_base64
+    return encode_base64<true>(output, input, length, options);
+  } else { // bit clear: <false> instantiation
+    return encode_base64<false>(output, input, length, options);
+  }
+}
+} // namespace westmere
+} // namespace simdutf
+
+/* begin file src/simdutf/westmere/end.h */
+#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
+// nothing needed.
+#else
+SIMDUTF_UNTARGET_REGION
+#endif
+
+/* end file src/simdutf/westmere/end.h */
+/* end file src/westmere/implementation.cpp */
+#endif
+#if SIMDUTF_IMPLEMENTATION_LSX
+/* begin file src/lsx/implementation.cpp */
+/* begin file src/simdutf/lsx/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "lsx"
+// #define SIMDUTF_IMPLEMENTATION lsx
+/* end file src/simdutf/lsx/begin.h */
+namespace simdutf {
+namespace lsx {
+namespace {
+#ifndef SIMDUTF_LSX_H
+  #error "lsx.h must be included"
+#endif
+using namespace simd;
+
+// Converts a vmskltz/vmskgez/vmsknz byte to a pack_1_2_utf8_bytes row index
+// (simdutf::tables::utf16_to_utf8): bit k -> 2k, bit 4+k -> 2k+1, for k < 4.
+const uint8_t lsx_1_2_utf8_bytes_mask[] = {
+    0,   1,   4,   5,   16,  17,  20,  21,  64,  65,  68,  69,  80,  81,  84,
+    85,  2,   3,   6,   7,   18,  19,  22,  23,  66,  67,  70,  71,  82,  83,
+    86,  87,  8,   9,   12,  13,  24,  25,  28,  29,  72,  73,  76,  77,  88,
+    89,  92,  93,  10,  11,  14,  15,  26,  27,  30,  31,  74,  75,  78,  79,
+    90,  91,  94,  95,  32,  33,  36,  37,  48,  49,  52,  53,  96,  97,  100,
+    101, 112, 113, 116, 117, 34,  35,  38,  39,  50,  51,  54,  55,  98,  99,
+    102, 103, 114, 115, 118, 119, 40,  41,  44,  45,  56,  57,  60,  61,  104,
+    105, 108, 109, 120, 121, 124, 125, 42,  43,  46,  47,  58,  59,  62,  63,
+    106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148,
+    149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147,
+    150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152,
+    153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143,
+    154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164,
+    165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163,
+    166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168,
+    169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253,
+    170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254,
+    255};
+
+simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) {
+  // Swaps the two bytes of every 16-bit lane. Equivalent alternatives:
+  //   __lsx_vshuf_b(__lsx_vldi(0), vec, {1, 0, 3, 2, 5, 4, ..., 15, 14})
+  return __lsx_vshuf4i_b(vec, 0b10110001); // bytes 1,0,3,2 of each 4-byte group
+  //   __lsx_vor_v(__lsx_vslli_h(vec, 8), __lsx_vsrli_h(vec, 8))
+}
+
+simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
+  return input.is_ascii(); // free-function wrapper over the member function
+}
+
+simdutf_unused simdutf_really_inline simd8<bool>
+must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
+                     const simd8<uint8_t> prev3) {
+  simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u); // >= 0xC0: any lead byte
+  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u); // >= 0xE0: 3/4-byte lead
+  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u); // >= 0xF0: 4-byte lead
+  // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller
+  // is using ^ as well. This will work fine because we only have to report
+  // errors for cases with 0-1 lead bytes. Multiple lead bytes implies 2
+  // overlapping multibyte characters, and if that happens, there is guaranteed
+  // to be at least *one* lead byte that is part of only 1 other multibyte
+  // character. The error will be detected there.
+  return is_second_byte ^ is_third_byte ^ is_fourth_byte;
+}
+
+simdutf_really_inline simd8<bool>
+must_be_2_3_continuation(const simd8<uint8_t> prev2,
+                         const simd8<uint8_t> prev3) {
+  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u); // >= 0xE0: 3/4-byte lead
+  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u); // >= 0xF0: 4-byte lead
+  return is_third_byte ^ is_fourth_byte; // must_be_continuation without the prev1 term
+}
+
+// common functions for utf8 conversions
+simdutf_really_inline __m128i convert_utf8_3_byte_to_utf16(__m128i in) {
+  // Turns four 3-byte UTF-8 characters (bytes 0..11 of in) into UTF-16 units.
+  // After the shuffle: low half 10bbbbbb|10cccccc, high half 1110aaaa|1110aaaa
+  const v16u8 sh = {2, 1, 5, 4, 8, 7, 11, 10, 0, 0, 3, 3, 6, 6, 9, 9};
+  const v8u16 v0fff = {0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff};
+
+  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, (__m128i)sh);
+  // 1110aaaa => aaaa0000 (the high half is first moved down by 8 bytes)
+  __m128i perm_high = __lsx_vslli_b(__lsx_vbsrl_v(perm, 8), 4);
+  // 10bbbbbb 10cccccc => 0010bbbb bbcccccc
+  __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), /* perm >> 2*/
+                                     perm, __lsx_vrepli_h(0x3f) /* 0x003f */);
+  // 0010bbbb bbcccccc => aaaabbbb bbcccccc (low 12 bits kept, top 4 from aaaa)
+  composed = __lsx_vbitsel_v(perm_high, composed, (__m128i)v0fff);
+  // Only the four low 16-bit lanes carry converted characters.
+  return composed;
+}
+
+simdutf_really_inline __m128i convert_utf8_2_byte_to_utf16(__m128i in) {
+  // 10bbbbbb 110aaaaa => 00bbbbbb 000aaaaa (16-bit lane, high byte written first)
+  __m128i composed = __lsx_vand_v(in, __lsx_vldi(0x3f));
+  // 00bbbbbb 000aaaaa => 00000aaa aabbbbbb
+  composed = __lsx_vbitsel_v(
+      __lsx_vsrli_h(__lsx_vslli_h(composed, 8), 2), /* (aaaaa << 8) >> 2 */
+      __lsx_vsrli_h(composed, 8),                   /* bbbbbb moved to the low byte */
+      __lsx_vrepli_h(0x3f));                        /* 0x003f: low 6 bits come from bbbbbb */
+  return composed;
+}
+
+simdutf_really_inline __m128i
+convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) {
+  // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 code units.
+  // This is a relatively easy scenario:
+  // we process SIX (6) input code points. The max length in bytes of six
+  // code points spanning between 1 and 2 bytes each is 12 bytes.
+  __m128i sh =
+      __lsx_vld(reinterpret_cast<const uint8_t *>(
+                    simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]),
+                0);
+  // Shuffle (shufutf8_idx selects the byte shuffle for this length pattern)
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 110aaaaa 10bbbbbb
+  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 00000000 00bbbbbb
+  __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits
+  // 1 byte: 00000000 00000000
+  // 2 byte: 00000aaa aa000000
+  const __m128i v1f00 = __lsx_vldi(-2785); // -2785 (13 bit) is 0x151f => 0x1f00 per lane
+  __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits
+  // Combine with an add (the two bit fields do not overlap)
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 00000aaa aabbbbbb
+  composed = __lsx_vadd_h(ascii, composed);
+  return composed;
+}
+
+/* begin file src/lsx/lsx_validate_utf16.cpp */
+/*
+    In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning.
+
+    In a vectorized algorithm we want to examine the most significant
+    nibble in order to select a fast path. If none of highest nibbles
+    are 0xD (13), than we are sure that UTF-16 chunk in a vector
+    register is valid.
+
+    Let us analyze what we need to check if the nibble is 0xD. The
+    value of the preceding nibble determines what we have:
+
+    0xd000 .. 0xd7ff - a valid word
+    0xd800 .. 0xdbff - low surrogate  (Unicode calls this the high/lead one)
+    0xdc00 .. 0xdfff - high surrogate (Unicode calls this the low/trail one)
+
+    Other constraints we have to consider:
+    - there must not be two consecutive low surrogates (0xd800 .. 0xdbff)
+    - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
+    - there must not be sole low surrogate nor high surrogate
+
+    We're going to build three bitmasks based on the 3rd nibble:
+    - V = valid word,
+    - L = low surrogate (0xd800 .. 0xdbff)
+    - H = high surrogate (0xdc00 .. 0xdfff)
+
+      0   1   2   3   4   5   6   7    <--- word index
+    [ V | L | H | L | H | V | V | L ]
+      1   0   0   0   0   1   1   0     - V = valid masks
+      0   1   0   1   0   0   0   1     - L = low surrogate
+      0   0   1   0   1   0   0   0     - H high surrogate
+
+
+      1   0   0   0   0   1   1   0   V = valid masks
+      0   1   0   1   0   0   0   0   a = L & (H >> 1)
+      0   0   1   0   1   0   0   0   b = a << 1
+      1   1   1   1   1   1   1   0   c = V | a | b
+                                  ^
+                                  the last bit can be zero, we just consume 7
+   code units (15 of 16 in the LSX code) and recheck this word next iteration
+*/
+
+/* Returns:
+   - pointer to the first unprocessed code unit (a scalar fallback should
+   check the rest);
+   - nullptr if an error was detected.
+*/
+template <endianness big_endian>
+const char16_t *lsx_validate_utf16(const char16_t *input, size_t size) {
+  const char16_t *end = input + size;
+  // Byte constants compared against the high byte of every code unit.
+  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+  const auto v_fc = simd8<uint8_t>::splat(0xfc);
+  const auto v_dc = simd8<uint8_t>::splat(0xdc);
+  // Each iteration inspects 16 code units (two vectors) starting at input.
+  while (input + simd16<uint16_t>::SIZE * 2 < end) {
+    // 0. Load data: since the validation takes into account only the higher
+    //    byte of each word, we compress the two vectors into one which
+    //    consists only of the higher bytes.
+    auto in0 = simd16<uint16_t>(input);
+    auto in1 =
+        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+    if (big_endian) {
+      in0 = in0.swap_bytes();
+      in1 = in1.swap_bytes();
+    }
+    const auto in = simd8<uint8_t>(__lsx_vssrlni_bu_h(in1.value, in0.value, 8));
+
+    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+    const auto surrogates_wordmask = (in & v_f8) == v_d8;
+    const uint16_t surrogates_bitmask =
+        static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
+    if (surrogates_bitmask == 0x0000) {
+      input += 16;
+    } else {
+      // 2. We have some surrogates that have to be distinguished:
+      //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+      //
+      //    Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+      // V - non-surrogate code units
+      //     V = not surrogates_wordmask
+      const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
+
+      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+      const auto vH = (in & v_fc) == v_dc;
+      const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
+
+      // L - word mask for low surrogates
+      //     L = not H and surrogates_wordmask
+      const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
+
+      const uint16_t a = static_cast<uint16_t>(
+          L & (H >> 1)); // A low surrogate must be followed by a high one.
+                         // (A low surrogate placed in word 15, the last word
+                         // of the register, is an exception we handle.)
+      const uint16_t b = static_cast<uint16_t>(
+          a << 1); // Just mark that the opposite fact holds as well;
+                   // thanks to that we have only two masks for the valid case.
+      const uint16_t c = static_cast<uint16_t>(
+          V | a | b); // Combine all the masks into the final one.
+
+      if (c == 0xffff) {
+        // The whole input register contains valid UTF-16, i.e.,
+        // either single code units or proper surrogate pairs.
+        input += 16;
+      } else if (c == 0x7fff) {
+        // The 15 lower code units of the input register contain valid UTF-16.
+        // The 15th word may be either a low or high surrogate. In the next
+        // iteration we 1) check if the low surrogate is followed by a high
+        // one, 2) reject sole high surrogate.
+        input += 15;
+      } else {
+        return nullptr;
+      }
+    }
+  }
+
+  return input;
+}
+
+template <endianness big_endian>
+const result lsx_validate_utf16_with_errors(const char16_t *input,
+                                            size_t size) {
+  const char16_t *start = input;
+  const char16_t *end = input + size;
+  // Same scan as lsx_validate_utf16, but reports a result instead of a pointer.
+  const auto v_d8 = simd8<uint8_t>::splat(0xd8);
+  const auto v_f8 = simd8<uint8_t>::splat(0xf8);
+  const auto v_fc = simd8<uint8_t>::splat(0xfc);
+  const auto v_dc = simd8<uint8_t>::splat(0xdc);
+
+  while (input + simd16<uint16_t>::SIZE * 2 < end) {
+    // 0. Load data: since the validation takes into account only the higher
+    //    byte of each word, we compress the two vectors into one which
+    //    consists only of the higher bytes.
+    auto in0 = simd16<uint16_t>(input);
+    auto in1 =
+        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+
+    if (big_endian) {
+      in0 = in0.swap_bytes();
+      in1 = in1.swap_bytes();
+    }
+
+    const auto in = simd8<uint8_t>(__lsx_vssrlni_bu_h(in1.value, in0.value, 8));
+
+    // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
+    const auto surrogates_wordmask = (in & v_f8) == v_d8;
+    const uint16_t surrogates_bitmask =
+        static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
+    if (surrogates_bitmask == 0x0000) {
+      input += 16;
+    } else {
+      // 2. We have some surrogates that have to be distinguished:
+      //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
+      //    - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF)
+      //
+      //    Fact: high surrogate has 11th bit set (3rd bit in the higher byte)
+
+      // V - non-surrogate code units
+      //     V = not surrogates_wordmask
+      const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
+
+      // H - word-mask for high surrogates: the six highest bits are 0b1101'11
+      const auto vH = (in & v_fc) == v_dc;
+      const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
+
+      // L - word mask for low surrogates
+      //     L = not H and surrogates_wordmask
+      const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
+
+      const uint16_t a = static_cast<uint16_t>(
+          L & (H >> 1)); // A low surrogate must be followed by a high one.
+                         // (A low surrogate placed in word 15, the last word
+                         // of the register, is an exception we handle.)
+      const uint16_t b = static_cast<uint16_t>(
+          a << 1); // Just mark that the opposite fact holds as well;
+                   // thanks to that we have only two masks for the valid case.
+      const uint16_t c = static_cast<uint16_t>(
+          V | a | b); // Combine all the masks into the final one.
+
+      if (c == 0xffff) {
+        // The whole input register contains valid UTF-16, i.e.,
+        // either single code units or proper surrogate pairs.
+        input += 16;
+      } else if (c == 0x7fff) {
+        // The 15 lower code units of the input register contain valid UTF-16.
+        // The 15th word may be either a low or high surrogate. In the next
+        // iteration we 1) check if the low surrogate is followed by a high
+        // one, 2) reject sole high surrogate.
+        input += 15;
+      } else {
+        return result(error_code::SURROGATE, input - start); // offset of this 16-unit chunk
+      }
+    }
+  }
+  // No error in the vectorized part; count = code units consumed so far.
+  return result(error_code::SUCCESS, input - start);
+}
+/* end file src/lsx/lsx_validate_utf16.cpp */
+/* begin file src/lsx/lsx_validate_utf32le.cpp */
+
+const char32_t *lsx_validate_utf32le(const char32_t *input, size_t size) {
+  const char32_t *end = input + size;
+  // Accumulates lane-wise maxima over the input and checks them once at the end.
+  __m128i offset = __lsx_vreplgr2vr_w(uint32_t(0xffff2000));
+  __m128i standardoffsetmax = __lsx_vreplgr2vr_w(uint32_t(0xfffff7ff));
+  __m128i standardmax = __lsx_vldi(-2288); /*0x10ffff*/
+  __m128i currentmax = __lsx_vldi(0x0);
+  __m128i currentoffsetmax = __lsx_vldi(0x0);
+
+  while (input + 4 < end) { // strict <: the final 4 (or fewer) units are not processed here
+    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input), 0);
+    currentmax = __lsx_vmax_wu(in, currentmax);
+    // 0xD8__ + 0x2000 = 0xF8__ => 0xF8__ > 0xF7FF
+    currentoffsetmax =
+        __lsx_vmax_wu(__lsx_vadd_w(in, offset), currentoffsetmax);
+
+    input += 4;
+  }
+  // max(currentmax, 0x10ffff) differs from 0x10ffff iff some value was larger.
+  __m128i is_zero =
+      __lsx_vxor_v(__lsx_vmax_wu(currentmax, standardmax), standardmax);
+  if (__lsx_bnz_v(is_zero)) { // despite its name, a non-zero is_zero signals an error
+    return nullptr;
+  }
+  // Same test on the offset values: surrogates end up above 0xfffff7ff.
+  is_zero = __lsx_vxor_v(__lsx_vmax_wu(currentoffsetmax, standardoffsetmax),
+                         standardoffsetmax);
+  if (__lsx_bnz_v(is_zero)) {
+    return nullptr;
+  }
+  // No error found: return the first code unit that was not processed.
+  return input;
+}
+
+const result lsx_validate_utf32le_with_errors(const char32_t *input,
+                                              size_t size) {
+  const char32_t *start = input;
+  const char32_t *end = input + size;
+  // Same running maxima as lsx_validate_utf32le, but tested on every iteration.
+  __m128i offset = __lsx_vreplgr2vr_w(uint32_t(0xffff2000));
+  __m128i standardoffsetmax = __lsx_vreplgr2vr_w(uint32_t(0xfffff7ff));
+  __m128i standardmax = __lsx_vldi(-2288); /*0x10ffff*/
+  __m128i currentmax = __lsx_vldi(0x0);
+  __m128i currentoffsetmax = __lsx_vldi(0x0);
+
+  while (input + 4 < end) { // strict <: the final 4 (or fewer) units are not processed here
+    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input), 0);
+    currentmax = __lsx_vmax_wu(in, currentmax);
+    currentoffsetmax =
+        __lsx_vmax_wu(__lsx_vadd_w(in, offset), currentoffsetmax);
+    // Range check first: some value so far exceeds 0x10ffff.
+    __m128i is_zero =
+        __lsx_vxor_v(__lsx_vmax_wu(currentmax, standardmax), standardmax);
+    if (__lsx_bnz_v(is_zero)) { // despite its name, a non-zero is_zero signals an error
+      return result(error_code::TOO_LARGE, input - start); // offset of this 4-unit chunk
+    }
+    // Then the surrogate check on the offset values (above 0xfffff7ff).
+    is_zero = __lsx_vxor_v(__lsx_vmax_wu(currentoffsetmax, standardoffsetmax),
+                           standardoffsetmax);
+    if (__lsx_bnz_v(is_zero)) {
+      return result(error_code::SURROGATE, input - start); // offset of this 4-unit chunk
+    }
+
+    input += 4;
+  }
+  // No error in the vectorized part; count = code units consumed so far.
+  return result(error_code::SUCCESS, input - start);
+}
+/* end file src/lsx/lsx_validate_utf32le.cpp */
+
+/* begin file src/lsx/lsx_convert_latin1_to_utf8.cpp */
+/*
+  Returns a pair: the first unprocessed byte of latin1_input and the next
+  write position in utf8_out. A scalar routine should convert the tail.
+*/
+
+std::pair<const char *, char *>
+lsx_convert_latin1_to_utf8(const char *latin1_input, size_t len,
+                           char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char *end = latin1_input + len;
+
+  __m128i zero = __lsx_vldi(0);
+  // We always store 16 bytes, of which at least the first 8 bytes
+  // are valid. A safety margin of 8 is more than sufficient.
+  while (latin1_input + 16 <= end) {
+    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(latin1_input), 0);
+    uint32_t ascii = __lsx_vpickve2gr_hu(__lsx_vmskgez_b(in8), 0); // bit i set <=> byte i >= 0 as int8
+    if (ascii == 0xffff) { // ASCII fast path!!!!
+      __lsx_vst(in8, utf8_output, 0);
+      utf8_output += 16;
+      latin1_input += 16;
+      continue;
+    }
+    // We just fallback on UTF-16 code. This could be optimized/simplified
+    // further. Only the low 8 input bytes are widened and converted here.
+    __m128i in16 = __lsx_vilvl_b(zero, in8);
+    // 1. prepare 2-byte values
+    // input 8-bit word : [aabb|bbbb] x 8
+    // expected output   : [1100|00aa|10bb|bbbb] x 8
+    // t0 = [0000|00aa|bbbb|bb00]
+    __m128i t0 = __lsx_vslli_h(in16, 2);
+    // t1 = [0000|00aa|0000|0000]
+    __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785));
+    // t2 = [0000|00aa|00bb|bbbb]
+    __m128i t2 = __lsx_vbitsel_v(t1, in16, __lsx_vrepli_h(0x3f));
+    // t3 = [1100|00aa|10bb|bbbb]
+    __m128i t3 = __lsx_vor_v(t2, __lsx_vreplgr2vr_h(uint16_t(0xc080)));
+    // merge ASCII and 2-byte codewords
+    __m128i one_byte_bytemask = __lsx_vsle_hu(in16, __lsx_vrepli_h(0x7F));
+    __m128i utf8_unpacked = __lsx_vbitsel_v(t3, in16, one_byte_bytemask);
+    // row[0] = number of output bytes, row + 1 = byte shuffle used for packing.
+    const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                             [lsx_1_2_utf8_bytes_mask[(ascii & 0xff)]][0];
+    __m128i shuffle = __lsx_vld(row + 1, 0);
+    __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
+
+    // store bytes (a full 16-byte vector, whatever row[0] is)
+    __lsx_vst(utf8_packed, utf8_output, 0);
+    // adjust pointers: 8 input bytes consumed, row[0] output bytes kept
+    latin1_input += 8;
+    utf8_output += row[0];
+
+  } // while
+
+  return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
+}
+/* end file src/lsx/lsx_convert_latin1_to_utf8.cpp */
+/* begin file src/lsx/lsx_convert_latin1_to_utf16.cpp */
+std::pair<const char *, char16_t *>
+lsx_convert_latin1_to_utf16le(const char *buf, size_t len,
+                              char16_t *utf16_output) {
+  const char *end = buf + len;
+  // Widens 16 Latin-1 bytes per iteration; fewer than 16 are left untouched.
+  __m128i zero = __lsx_vldi(0);
+  while (buf + 16 <= end) {
+    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
+    // Interleave the input bytes with zero bytes: low half, then high half.
+    __m128i inlow = __lsx_vilvl_b(zero, in8);
+    __m128i inhigh = __lsx_vilvh_b(zero, in8);
+    __lsx_vst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    __lsx_vst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 16); // byte offset 16 = 8 code units on
+
+    utf16_output += 16;
+    buf += 16;
+  }
+  // Returns the first unconverted input byte and the next output position.
+  return std::make_pair(buf, utf16_output);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(
-    const char16_t *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_options) const noexcept {
-  return (options & base64_url)
-             ? compress_decode_base64<true>(output, input, length, options,
-                                            last_chunk_options)
-             : compress_decode_base64<false>(output, input, length, options,
-                                             last_chunk_options);
-}
+std::pair<const char *, char16_t *>
+lsx_convert_latin1_to_utf16be(const char *buf, size_t len,
+                              char16_t *utf16_output) {
+  const char *end = buf + len;
+  __m128i zero = __lsx_vldi(0);
+  while (buf + 16 <= end) { // 16 Latin-1 bytes per iteration
+    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
+    // Operands are swapped relative to lsx_convert_latin1_to_utf16le.
+    __m128i inlow = __lsx_vilvl_b(in8, zero);
+    __m128i inhigh = __lsx_vilvh_b(in8, zero);
+    __lsx_vst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    __lsx_vst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 16); // byte offset 16 = 8 code units on
+    utf16_output += 16;
+    buf += 16;
+  }
 
-simdutf_warn_unused full_result implementation::base64_to_binary_details(
-    const char16_t *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_options) const noexcept {
-  return (options & base64_url)
-             ? compress_decode_base64<true>(output, input, length, options,
-                                            last_chunk_options)
-             : compress_decode_base64<false>(output, input, length, options,
-                                             last_chunk_options);
+  return std::make_pair(buf, utf16_output); // first unconverted byte, next output position
 }
+/* end file src/lsx/lsx_convert_latin1_to_utf16.cpp */
+/* begin file src/lsx/lsx_convert_latin1_to_utf32.cpp */
+std::pair<const char *, char32_t *>
+lsx_convert_latin1_to_utf32(const char *buf, size_t len,
+                            char32_t *utf32_output) {
+  const char *end = buf + len;
 
-simdutf_warn_unused size_t implementation::base64_length_from_binary(
-    size_t length, base64_options options) const noexcept {
-  return scalar::base64::base64_length_from_binary(length, options);
-}
+  while (buf + 16 <= end) { // 16 Latin-1 bytes -> 16 UTF-32 code units per pass
+    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
 
-size_t implementation::binary_to_base64(const char *input, size_t length,
-                                        char *output,
-                                        base64_options options) const noexcept {
-  if (options & base64_url) {
-    return encode_base64<true>(output, input, length, options);
-  } else {
-    return encode_base64<false>(output, input, length, options);
+    __m128i zero = __lsx_vldi(0); // interleaved in twice: bytes -> 16-bit -> 32-bit lanes
+    __m128i in16low = __lsx_vilvl_b(zero, in8);
+    __m128i in16high = __lsx_vilvh_b(zero, in8);
+    __m128i in32_0 = __lsx_vilvl_h(zero, in16low);
+    __m128i in32_1 = __lsx_vilvh_h(zero, in16low);
+    __m128i in32_2 = __lsx_vilvl_h(zero, in16high);
+    __m128i in32_3 = __lsx_vilvh_h(zero, in16high);
+    // Four stores of 4 code units each, at consecutive output positions.
+    __lsx_vst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
+    __lsx_vst(in32_1, reinterpret_cast<uint32_t *>(utf32_output + 4), 0);
+    __lsx_vst(in32_2, reinterpret_cast<uint32_t *>(utf32_output + 8), 0);
+    __lsx_vst(in32_3, reinterpret_cast<uint32_t *>(utf32_output + 12), 0);
+
+    utf32_output += 16;
+    buf += 16;
   }
-}
-} // namespace haswell
-} // namespace simdutf
 
-/* begin file src/simdutf/haswell/end.h */
-#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL
-// nothing needed.
-#else
-SIMDUTF_UNTARGET_REGION
-#endif
+  return std::make_pair(buf, utf32_output); // first unconverted byte, next output position
+}
+/* end file src/lsx/lsx_convert_latin1_to_utf32.cpp */
 
+/* begin file src/lsx/lsx_convert_utf8_to_utf16.cpp */
+// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
+// end of the code points. Only the least significant 12 bits of the mask
+// are accessed.
+// It returns how many bytes were consumed (up to 16, usually 12).
+template <endianness big_endian>
+size_t convert_masked_utf8_to_utf16(const char *input,
+                                    uint64_t utf8_end_of_code_point_mask,
+                                    char16_t *&utf16_output) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff;
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it
+  // is maybe beneficial to have fast paths that depend on branch prediction but
+  // have less latency. This results in more instructions but, potentially, also
+  // higher speeds.
 
-#if SIMDUTF_GCC11ORMORE // workaround for
-                        // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593
-SIMDUTF_POP_DISABLE_WARNINGS
-#endif // end of workaround
-/* end file src/simdutf/haswell/end.h */
-/* end file src/haswell/implementation.cpp */
-#endif
-#if SIMDUTF_IMPLEMENTATION_PPC64
-/* begin file src/ppc64/implementation.cpp */
+  // We first try a few fast paths.
+  // The obvious first test is ASCII, which actually consumes the full 16.
+  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
+    // We process in chunks of 16 bytes
+    // The routine in simd.h is reused.
+    simd8<int8_t> temp{in};
+    temp.store_ascii_as_utf16<big_endian>(utf16_output);
+    utf16_output += 16; // We wrote 16 16-bit characters.
+    return 16;          // We consumed 16 bytes.
+  }
 
+  uint64_t buffer[2];
+  // 3 byte sequences are the next most common, as seen in CJK, which has long
+  // sequences of these.
+  if (input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
+    // UTF-16 code units.
+    __m128i composed = convert_utf8_3_byte_to_utf16(in);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed = lsx_swap_bytes(composed);
+    }
 
+    __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    utf16_output += 4; // We wrote 4 16-bit characters.
+    return 12;         // We consumed 12 bytes.
+  }
 
+  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xAAAA) {
+    // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte
+    // UTF-16 code units.
+    __m128i composed = convert_utf8_2_byte_to_utf16(in);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed = lsx_swap_bytes(composed);
+    }
 
+    __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    utf16_output += 6; // We wrote 6 16-bit characters.
+    return 12;         // We consumed 12 bytes.
+  }
 
-/* begin file src/simdutf/ppc64/begin.h */
-// redefining SIMDUTF_IMPLEMENTATION to "ppc64"
-// #define SIMDUTF_IMPLEMENTATION ppc64
-/* end file src/simdutf/ppc64/begin.h */
-namespace simdutf {
-namespace ppc64 {
-namespace {
-#ifndef SIMDUTF_PPC64_H
-  #error "ppc64.h must be included"
-#endif
-using namespace simd;
+  /// We do not have a fast path available, or the fast path is unimportant, so
+  /// we fallback.
+  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][0];
 
-simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
-  // careful: 0x80 is not ascii.
-  return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere();
-}
+  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][1];
+  const __m128i zero = __lsx_vldi(0);
+  if (idx < 64) {
+    // SIX (6) input code-code units
+    // Convert to UTF-16
+    __m128i composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed = lsx_swap_bytes(composed);
+    }
+    // Store
+    __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    utf16_output += 6; // We wrote 6 16-bit characters.
+    return consumed;
+  } else if (idx < 145) {
+    // FOUR (4) input code-code units
+    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+                           0);
+    // XXX: depending on the system scalar instructions might be faster.
+    // 1 byte: 00000000 00000000 0ccccccc
+    // 2 byte: 00000000 110bbbbb 10cccccc
+    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+    __m128i perm = __lsx_vshuf_b(zero, in, sh);
+    // 1 byte: 00000000 0ccccccc
+    // 2 byte: xx0bbbbb x0cccccc
+    // 3 byte: xxbbbbbb x0cccccc
+    __m128i lowperm = __lsx_vpickev_h(perm, perm);
+    // 1 byte: 00000000 00000000
+    // 2 byte: 00000000 00000000
+    // 3 byte: 00000000 1110aaaa
+    __m128i highperm = __lsx_vpickod_h(perm, perm);
+    // 3 byte: aaaa0000 00000000
+    highperm = __lsx_vslli_h(highperm, 12);
+    // ASCII
+    // 1 byte: 00000000 0ccccccc
+    // 2+byte: 00000000 00cccccc
+    __m128i ascii = __lsx_vand_v(lowperm, __lsx_vrepli_h(0x7f));
+    // 1 byte: 00000000 00000000
+    // 2 byte: xx0bbbbb 00000000
+    // 3 byte: xxbbbbbb 00000000
+    __m128i middlebyte = __lsx_vand_v(lowperm, __lsx_vldi(-2561) /*0xFF00*/);
+    // 1 byte: 00000000 0ccccccc
+    // 2 byte: 0010bbbb bbcccccc
+    // 3 byte: 0010bbbb bbcccccc
+    __m128i composed = __lsx_vor_v(__lsx_vsrli_h(middlebyte, 2), ascii);
 
-simdutf_unused simdutf_really_inline simd8<bool>
-must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
-                     const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_second_byte =
-      prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
-  simd8<uint8_t> is_third_byte =
-      prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte =
-      prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction
-  // will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) >
-         int8_t(0);
-}
+    __m128i v0fff = __lsx_vreplgr2vr_h(uint16_t(0xfff));
+    // aaaabbbb bbcccccc
+    composed = __lsx_vbitsel_v(highperm, composed, v0fff);
 
-simdutf_really_inline simd8<bool>
-must_be_2_3_continuation(const simd8<uint8_t> prev2,
-                         const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_third_byte =
-      prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80
-  simd8<uint8_t> is_fourth_byte =
-      prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80
-  // Caller requires a bool (all 1's). All values resulting from the subtraction
-  // will be <= 64, so signed comparison is fine.
-  return simd8<bool>(is_third_byte | is_fourth_byte);
-}
+    if (!match_system(big_endian)) {
+      composed = lsx_swap_bytes(composed);
+    }
 
-} // unnamed namespace
-} // namespace ppc64
-} // namespace simdutf
+    __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    utf16_output += 4; // We wrote 4 16-bit codepoints
+    return consumed;
+  } else if (idx < 209) {
+    // THREE (3) input code-code units
+    if (input_utf8_end_of_code_point_mask == 0x888) {
+      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
+      // UTF-16 pairs. Generating surrogate pairs is a little tricky though, but
+      // it is easier when we can assume they are all pairs. This version does
+      // not use the LUT, but 4 byte sequences are less common and the overhead
+      // of the extra memory access is less important than the early branch
+      // overhead in shorter sequences.
 
-/* begin file src/generic/buf_block_reader.h */
-namespace simdutf {
-namespace ppc64 {
-namespace {
+      // Swap byte pairs
+      // 10dddddd 10cccccc|10bbbbbb 11110aaa
+      // 10cccccc 10dddddd|11110aaa 10bbbbbb
+      __m128i swap = lsx_swap_bytes(in);
+      // Shift left 2 bits
+      // cccccc00 dddddd00 xxxxxxxx bbbbbb00
+      __m128i shift = __lsx_vslli_b(swap, 2);
+      // Create a magic number containing the low 2 bits of the trail surrogate
+      // and all the corrections needed to create the pair. UTF-8 4b prefix   =
+      // -0x0000|0xF000 surrogate offset  = -0x0000|0x0040 (0x10000 << 6)
+      // surrogate high    = +0x0000|0xD800
+      // surrogate low     = +0xDC00|0x0000
+      // -------------------------------
+      //                   = +0xDC00|0xE7C0
+      __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0));
+      // Generate unadjusted trail surrogate minus lowest 2 bits
+      // vec(0000FF00) = __lsx_vldi(-1758)
+      // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
+      __m128i trail =
+          __lsx_vbitsel_v(shift, swap, __lsx_vldi(-1758 /*0000FF00*/));
+      // Insert low 2 bits of trail surrogate to magic number for later
+      // 11011100 00000000 11100111 110000cc
+      __m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic);
 
-// Walks through a buffer in block-sized increments, loading the last part with
-// spaces
-template <size_t STEP_SIZE> struct buf_block_reader {
-public:
-  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
-  simdutf_really_inline size_t block_index();
-  simdutf_really_inline bool has_full_block() const;
-  simdutf_really_inline const uint8_t *full_block() const;
-  /**
-   * Get the last block, padded with spaces.
-   *
-   * There will always be a last block, with at least 1 byte, unless len == 0
-   * (in which case this function fills the buffer with spaces and returns 0. In
-   * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder
-   * block with STEP_SIZE bytes and no spaces for padding.
-   *
-   * @return the number of effective characters in the last block.
-   */
-  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
-  simdutf_really_inline void advance();
+      // Generate lead surrogate
+      // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
+      // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
+      __m128i lead = __lsx_vbitsel_v(
+          __lsx_vsrli_h(__lsx_vand_v(shift, __lsx_vldi(0x3F)), 4), swap,
+          __lsx_vrepli_h(0x3f /* 0x003f*/));
 
-private:
-  const uint8_t *buf;
-  const size_t len;
-  const size_t lenminusstep;
-  size_t idx;
-};
+      // Blend pairs
+      // __lsx_vldi(-1741) => vec(0x0000FFFF)
+      // 000000cc ccdddddd|11110aaa bbbbbb00
+      __m128i blend =
+          __lsx_vbitsel_v(lead, trail, __lsx_vldi(-1741) /* (0x0000FFFF)*4 */);
 
-// Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char *format_input_text_64(const uint8_t *text) {
-  static char *buf =
-      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
-    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]);
+      // Add magic number to finish the result
+      // 110111CC CCDDDDDD|110110AA BBBBBBCC
+      __m128i composed = __lsx_vadd_h(blend, magic_with_low_2);
+      // Byte swap if necessary
+      if (!match_system(big_endian)) {
+        composed = lsx_swap_bytes(composed);
+      }
+      // __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+      __lsx_vst(composed, reinterpret_cast<uint16_t *>(buffer), 0);
+      std::memcpy(utf16_output, buffer, 12);
+      utf16_output += 6; // We wrote 3 32-bit surrogate pairs.
+      return 12;         // We consumed 12 bytes.
+    }
+    // 3 1-4 byte sequences
+    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+                           0);
+    // 1 byte: 00000000 00000000 00000000 0ddddddd
+    // 3 byte: 00000000 00000000 110ccccc 10dddddd
+    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+    __m128i perm = __lsx_vshuf_b(zero, in, sh);
+    // added to fix issue https://github.com/simdutf/simdutf/issues/514
+    // We only want to write 2 * 16-bit code units when that is actually what we
+    // have. Unfortunately, we cannot trust the input. So it is possible to get
+    // 0xff as an input byte and it should not result in a surrogate pair. We
+    // need to check for that.
+    uint32_t permbuffer[4];
+    __lsx_vst(perm, permbuffer, 0);
+    // Mask the low and middle bytes
+    // 00000000 00000000 00000000 0ddddddd
+    __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7f));
+    // Because the surrogates need more work, the high surrogate is computed
+    // first.
+    __m128i middlehigh = __lsx_vslli_w(perm, 2);
+    // 00000000 00000000 00cccccc 00000000
+    __m128i middlebyte = __lsx_vand_v(perm, __lsx_vldi(-3777) /* 0x00003F00 */);
+    // Start assembling the sequence. Since the 4th byte is in the same position
+    // as it would be in a surrogate and there is no dependency, shift left
+    // instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte:
+    // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
+    __m128i ab =
+        __lsx_vbitsel_v(middlehigh, perm, __lsx_vldi(-1656) /*0xFF000000*/);
+    // Top 16 bits contains the high ten bits of the surrogate pair before
+    // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa
+    // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction
+    __m128i v_fffc0000 = __lsx_vreplgr2vr_w(uint32_t(0xFFFC0000));
+    __m128i abc = __lsx_vbitsel_v(__lsx_vslli_w(middlebyte, 4), ab, v_fffc0000);
+    // Combine the low 6 or 7 bits by a shift right accumulate
+    // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
+    // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o
+    // correction
+    __m128i composed = __lsx_vor_v(ascii, __lsx_vsrli_w(abc, 6));
+    // After this is for surrogates
+    // Blend the low and high surrogates
+    // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
+    __m128i mixed =
+        __lsx_vbitsel_v(abc, composed, __lsx_vldi(-1741) /*0x0000FFFF*/);
+    // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits
+    // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte:
+    // 11110aaa bbbbbbcc|000000cc ccdddddd
+    __m128i v_ffff03ff = __lsx_vreplgr2vr_w(uint32_t(0xFFFF03FF));
+    __m128i masked_pair = __lsx_vand_v(mixed, v_ffff03ff);
+    // Correct the remaining UTF-8 prefix, surrogate offset, and add the
+    // surrogate prefixes in one magic 16-bit addition. similar magic number but
+    // without the continue byte adjust and halfword swapped UTF-8 4b prefix   =
+    // -0xF000|0x0000 surrogate offset  = -0x0040|0x0000 (0x10000 << 6)
+    // surrogate high    = +0xD800|0x0000
+    // surrogate low     = +0x0000|0xDC00
+    // -----------------------------------
+    //                   = +0xE7C0|0xDC00
+    __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xE7C0DC00));
+    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
+    __m128i surrogates = __lsx_vadd_w(masked_pair, magic);
+    // If the high bit is 1 (s32 less than zero), this needs a surrogate pair
+    __m128i is_pair = __lsx_vslt_w(perm, zero);
+    // Select either the 4 byte surrogate pair or the 2 byte solo codepoint
+    // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
+    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
+    __m128i selected = __lsx_vbitsel_v(composed, surrogates, is_pair);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      selected = lsx_swap_bytes(selected);
+    }
+    // Attempting to shuffle and store would be complex, just scalarize.
+    uint32_t buffer_tmp[4];
+    __lsx_vst(selected, buffer_tmp, 0);
+    // Test for the top bit of the surrogate mask. Remove due to issue 514
+    // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 :
+    // 0x00800000;
+    for (size_t i = 0; i < 3; i++) {
+      // Surrogate
+      // Used to be if (buffer[i] & SURROGATE_MASK) {
+      // See discussion above.
+      // patch for issue https://github.com/simdutf/simdutf/issues/514
+      if ((permbuffer[i] & 0xf8000000) == 0xf0000000) {
+        utf16_output[0] = uint16_t(buffer_tmp[i] >> 16);
+        utf16_output[1] = uint16_t(buffer_tmp[i] & 0xFFFF);
+        utf16_output += 2;
+      } else {
+        utf16_output[0] = uint16_t(buffer_tmp[i] & 0xFFFF);
+        utf16_output++;
+      }
+    }
+    return consumed;
+  } else {
+    // here we know that there is an error but we do not handle errors
+    return 12;
   }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
 }
+/* end file src/lsx/lsx_convert_utf8_to_utf16.cpp */
+/* begin file src/lsx/lsx_convert_utf8_to_utf32.cpp */
+// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
+// end of the code points. The table-driven paths use only the 12 least
+// significant mask bits; the all-ASCII fast path tests 16 bits instead.
+// It returns how many bytes were consumed (up to 12, or 16 for all-ASCII).
+size_t convert_masked_utf8_to_utf32(const char *input,
+                                    uint64_t utf8_end_of_code_point_mask,
+                                    char32_t *&utf32_out) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out);
+  __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xFFF;
+  //
+  // Optimization note: our main path below is load-latency dependent. Thus it
+  // is maybe beneficial to have fast paths that depend on branch prediction but
+  // have less latency. This results in more instructions but, potentially, also
+  // higher speeds.
+  //
+  // We first try a few fast paths.
+  if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
+    // We process in chunks of 16 bytes.
+    // Widen to UTF-32 with simd8<int8_t>::store_ascii_as_utf32_tbl().
+    // Ideally the compiler can keep the tables in registers.
+    simd8<int8_t> temp{in};
+    temp.store_ascii_as_utf32_tbl(utf32_out);
+    utf32_output += 16; // We wrote 16 32-bit characters.
+    return 16;          // We consumed 16 bytes.
+  }
+  __m128i zero = __lsx_vldi(0);
+  if (input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
+    // UTF-32 code units: convert to UTF-16, then zero-extend each to 32 bits.
+    __m128i composed_utf16 = convert_utf8_3_byte_to_utf16(in);
+    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
 
-// Routines to print masks and text for debugging bitmask operations
-simdutf_unused static char *format_input_text(const simd8x64<uint8_t> &in) {
-  static char *buf =
-      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
-  in.store(reinterpret_cast<uint8_t *>(buf));
-  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
-    if (buf[i] < ' ') {
-      buf[i] = '_';
-    }
+    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
+    utf32_output += 4; // We wrote 4 32-bit characters.
+    return 12;         // We consumed 12 bytes.
   }
-  buf[sizeof(simd8x64<uint8_t>)] = '\0';
-  return buf;
-}
+  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+  if (input_utf8_end_of_code_point_mask == 0xaaa) {
+    // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte
+    // UTF-32 code units. Convert to UTF-16
+    __m128i composed_utf16 = convert_utf8_2_byte_to_utf16(in);
 
-simdutf_unused static char *format_mask(uint64_t mask) {
-  static char *buf = reinterpret_cast<char *>(malloc(64 + 1));
-  for (size_t i = 0; i < 64; i++) {
-    buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' ';
+    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
+    __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);
+
+    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
+    __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
+    utf32_output += 6; // We wrote 6 32-bit characters.
+    return 12; // We consumed 12 bytes.
   }
-  buf[64] = '\0';
-  return buf;
-}
+  /// Either no fast path or an unimportant fast path.
 
-template <size_t STEP_SIZE>
-simdutf_really_inline
-buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
-    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE},
-      idx{0} {}
+  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][1];
 
-template <size_t STEP_SIZE>
-simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
-  return idx;
+  if (idx < 64) {
+    // SIX (6) input code-code units
+    // Convert to UTF-16
+    __m128i composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
+    __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);
+
+    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
+    __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
+    utf32_output += 6; // We wrote 6 32-bit characters.
+    return consumed;
+  } else if (idx < 145) {
+    // FOUR (4) input code-code units
+    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+                           0);
+    // Shuffle
+    // 1 byte: 00000000 00000000 0ccccccc
+    // 2 byte: 00000000 110bbbbb 10cccccc
+    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+    __m128i perm = __lsx_vshuf_b(zero, in, sh);
+    // Split
+    // 00000000 00000000 0ccccccc
+    __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); // 6 or 7 bits
+    // Note: unmasked
+    // xxxxxxxx aaaaxxxx xxxxxxxx
+    __m128i high =
+        __lsx_vsrli_w(__lsx_vand_v(perm, __lsx_vldi(0xf)), 4); // 4 bits
+    // Keep the whole middle byte (bits 8-15), prefix bits included.
+    // The stray prefix bits are overwritten later by the bitwise select.
+    // 00000000 10bbbbbb 00000000
+    __m128i middle =
+        __lsx_vand_v(perm, __lsx_vldi(-1758 /*0x0000FF00*/)); // 5 or 6 bits
+    // Combine low and middle with shift right accumulate
+    // 00000000 00xxbbbb bbcccccc
+    __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2));
+    // Insert top 4 bits from high byte with bitwise select
+    // 00000000 aaaabbbb bbcccccc
+    __m128i composed =
+        __lsx_vbitsel_v(lowmid, high, __lsx_vldi(-3600 /*0x0000F000*/));
+    __lsx_vst(composed, utf32_output, 0);
+    utf32_output += 4; // We wrote 4 32-bit characters.
+    return consumed;
+  } else if (idx < 209) {
+    // THREE (3) input code-code units
+    if (input_utf8_end_of_code_point_mask == 0x888) {
+      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
+      // UTF-32 code units. This uses the same method as the fixed 3 byte
+      // version, reversing and shift left insert. However, there is no need for
+      // a shuffle mask now, just rev16 and rev32.
+      //
+      // This version does not use the LUT, but 4 byte sequences are less common
+      // and the overhead of the extra memory access is less important than the
+      // early branch overhead in shorter sequences, so it comes last.
+
+      // Swap pairs of bytes
+      // 10dddddd|10cccccc|10bbbbbb|11110aaa
+      // 10cccccc 10dddddd|11110aaa 10bbbbbb
+      __m128i swap = lsx_swap_bytes(in);
+      // Shift right by 2 and re-insert the original low 6 bits
+      // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
+      __m128i merge1 = __lsx_vbitsel_v(__lsx_vsrli_h(swap, 2), swap,
+                                       __lsx_vrepli_h(0x3f /*0x003F*/));
+      // Shift insert again
+      // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
+      __m128i merge2 =
+          __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */
+                          __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */
+                          __lsx_vldi(-2545));        /*0x00000FFF*/
+      // Clear the garbage
+      // 00000000 000aaabb bbbbcccc ccdddddd
+      __m128i composed = __lsx_vand_v(merge2, __lsx_vldi(-2273 /*0x1FFFFF*/));
+      // Store
+      __lsx_vst(composed, utf32_output, 0);
+      utf32_output += 3; // We wrote 3 32-bit characters.
+      return 12;         // We consumed 12 bytes.
+    }
+    // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit
+    // due to surrogates no longer being involved.
+    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+                           0);
+    // 1 byte: 00000000 00000000 00000000 0ddddddd
+    // 2 byte: 00000000 00000000 110ccccc 10dddddd
+    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+    __m128i perm = __lsx_vshuf_b(zero, in, sh);
+
+    // Ascii
+    __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F));
+    __m128i middle = __lsx_vand_v(perm, __lsx_vldi(-3777 /*0x00003f00*/));
+    // 00000000 00000000 0000cccc ccdddddd
+    __m128i cd =
+        __lsx_vbitsel_v(__lsx_vsrli_w(middle, 2), ascii, __lsx_vrepli_w(0x3f));
+
+    __m128i correction = __lsx_vand_v(perm, __lsx_vldi(-3520 /*0x00400000*/));
+    __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1));
+    // Insert twice
+    // 00000000 000aaabb bbbbxxxx xxxxxxxx
+    __m128i corrected_srli2 =
+        __lsx_vsrli_w(__lsx_vand_v(corrected, __lsx_vrepli_b(0x7)), 2);
+    __m128i ab =
+        __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f));
+    ab = __lsx_vsrli_w(ab, 4);
+    // 00000000 000aaabb bbbbcccc ccdddddd
+    __m128i composed =
+        __lsx_vbitsel_v(ab, cd, __lsx_vldi(-2545 /*0x00000FFF*/));
+    // Store
+    __lsx_vst(composed, utf32_output, 0);
+    utf32_output += 3; // We wrote 3 32-bit characters.
+    return consumed;
+  } else {
+    // here we know that there is an error but we do not handle errors
+    return 12;
+  }
 }
+/* end file src/lsx/lsx_convert_utf8_to_utf32.cpp */
+/* begin file src/lsx/lsx_convert_utf8_to_latin1.cpp */
+size_t convert_masked_utf8_to_latin1(const char *input,
+                                     uint64_t utf8_end_of_code_point_mask,
+                                     char *&latin1_output) {
+  // we use an approach where we try to process up to 12 input bytes.
+  // Why 12 input bytes and not 16? Because we are concerned with the size of
+  // the lookup tables. Also 12 is nicely divisible by two and three.
+  //
+  __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
+
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff;
+  // Optimization note: our main path below is load-latency dependent. Thus it
+  // is maybe beneficial to have fast paths that depend on branch prediction but
+  // have less latency. This results in more instructions but, potentially, also
+  // higher speeds.
+
+  // We first try a few fast paths.
+  // The obvious first test is ASCII, which actually consumes the full 16.
+  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
+    // We process in chunks of 16 bytes
+    __lsx_vst(in, reinterpret_cast<uint8_t *>(latin1_output), 0);
+    latin1_output += 16; // We wrote 16 8-bit characters.
+    return 16;           // We consumed 16 bytes.
+  }
+  /// We do not have a fast path available, or the fast path is unimportant, so
+  /// we fallback.
+  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][0];
+
+  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][1];
+  // this indicates an invalid input:
+  if (idx >= 64) {
+    return consumed;
+  }
+  // Here we should have (idx < 64), if not, there is a bug in the validation or
+  // elsewhere. This is the relatively easy scenario with SIX (6) input code
+  // points, each encoded in 1 or 2 bytes, so they span at most 12 bytes of
+  // input. We shuffle each code point into its own 16-bit lane, merge the
+  // payload bits of 2-byte sequences into the low byte, and then keep only
+  // the even-indexed byte of every lane. The result is 6 Latin-1 characters;
+  // the final store goes through a stack buffer so only 6 bytes are written.
+  __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+                             simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+                         0);
+  // Shuffle
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 110aaaaa 10bbbbbb
+  sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
+  // ascii mask
+  // 1 byte: 11111111 11111111
+  // 2 byte: 00000000 00000000
+  __m128i ascii_mask = __lsx_vslt_bu(perm, __lsx_vldi(0x80));
+  // utf8 mask
+  // 1 byte: 00000000 00000000
+  // 2 byte: 00111111 00111111
+  __m128i utf8_mask = __lsx_vand_v(__lsx_vsle_bu(__lsx_vldi(0x80), perm),
+                                   __lsx_vldi(0b00111111));
+  // mask
+  //  1 byte: 11111111 11111111
+  //  2 byte: 00111111 00111111
+  __m128i mask = __lsx_vor_v(utf8_mask, ascii_mask);
+
+  __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), perm, mask);
+  // writing 8 bytes even though we only care about the first 6 bytes.
+  __m128i latin1_packed = __lsx_vpickev_b(__lsx_vldi(0), composed);
 
-template <size_t STEP_SIZE>
-simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
-  return idx < lenminusstep;
+  uint64_t buffer[2];
+  // __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+  __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(buffer), 0);
+  std::memcpy(latin1_output, buffer, 6);
+  latin1_output += 6; // We wrote 6 bytes.
+  return consumed;
 }
+/* end file src/lsx/lsx_convert_utf8_to_latin1.cpp */
 
-template <size_t STEP_SIZE>
-simdutf_really_inline const uint8_t *
-buf_block_reader<STEP_SIZE>::full_block() const {
-  return &buf[idx];
+/* begin file src/lsx/lsx_convert_utf16_to_latin1.cpp */
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+lsx_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+                            char *latin1_output) {
+  const char16_t *end = buf + len;
+  while (buf + 16 <= end) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    __m128i in1 = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
+    if (!match_system(big_endian)) {
+      in = lsx_swap_bytes(in);
+      in1 = lsx_swap_bytes(in1);
+    }
+    if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) {
+      // 1. pack the even-indexed bytes (the odd-indexed ones are all zero)
+      __m128i latin1_packed = __lsx_vpickev_b(in1, in);
+      // 2. store (16 bytes)
+      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+      // 3. adjust pointers
+      buf += 16;
+      latin1_output += 16;
+    } else {
+      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
+    }
+  } // while
+  return std::make_pair(buf, latin1_output);
 }
 
-template <size_t STEP_SIZE>
-simdutf_really_inline size_t
-buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const {
-  if (len == idx) {
-    return 0;
-  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
-  std::memset(dst, 0x20,
-              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
-                          // to write out 8 or 16 bytes at once.
-  std::memcpy(dst, buf + idx, len - idx);
-  return len - idx;
+template <endianness big_endian>
+std::pair<result, char *>
+lsx_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
+                                        char *latin1_output) {
+  const char16_t *start = buf;
+  const char16_t *end = buf + len;
+  while (buf + 16 <= end) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    __m128i in1 = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
+    if (!match_system(big_endian)) {
+      in = lsx_swap_bytes(in);
+      in1 = lsx_swap_bytes(in1);
+    }
+    if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) {
+      // 1. pack the even-indexed bytes (the odd-indexed ones are all zero)
+      __m128i latin1_packed = __lsx_vpickev_b(in1, in);
+      // 2. store (16 bytes)
+      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+      // 3. adjust pointers
+      buf += 16;
+      latin1_output += 16;
+    } else {
+      // Scalar fallback: emit units up to the first one above 0xff, report it.
+      for (int k = 0; k < 16; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if (word <= 0xff) {
+          *latin1_output++ = char(word);
+        } else {
+          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+                                latin1_output);
+        }
+      }
+    }
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        latin1_output);
 }
+/* end file src/lsx/lsx_convert_utf16_to_latin1.cpp */
+/* begin file src/lsx/lsx_convert_utf16_to_utf8.cpp */
+/*
+    The vectorized algorithm works on a single 128-bit register, i.e., it
+    loads eight 16-bit code units.
 
-template <size_t STEP_SIZE>
-simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
-  idx += STEP_SIZE;
-}
+    We consider three cases:
+    1. an input register contains no surrogates and each value
+       is in range 0x0000 .. 0x07ff.
+    2. an input register contains no surrogates and values are
+       in range 0x0000 .. 0xffff.
+    3. an input register contains surrogates --- i.e. codepoints
+       can have 16 or 32 bits.
 
-} // unnamed namespace
-} // namespace ppc64
-} // namespace simdutf
-/* end file src/generic/buf_block_reader.h */
-/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
-namespace simdutf {
-namespace ppc64 {
-namespace {
-namespace utf8_validation {
+    Ad 1.
 
-using namespace simd;
+    When values are less than 0x0800, it means that a 16-bit code unit
+    can be converted into: 1) single UTF8 byte (when it's an ASCII
+    char) or 2) two UTF8 bytes.
 
-simdutf_really_inline simd8<uint8_t>
-check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-  // Bit 1 = Too Long (ASCII followed by continuation)
-  // Bit 2 = Overlong 3-byte
-  // Bit 4 = Surrogate
-  // Bit 5 = Overlong 2-byte
-  // Bit 7 = Two Continuations
-  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
-                                               // 11______ 11______
-  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
-  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
-  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
-  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
-                                               // 11110100 101_____
-                                               // 11110101 1001____
-                                               // 11110101 101_____
-                                               // 1111011_ 1001____
-                                               // 1111011_ 101_____
-                                               // 11111___ 1001____
-                                               // 11111___ 101_____
-  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-  // 11110101 1000____
-  // 1111011_ 1000____
-  // 11111___ 1000____
-  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+    For this case we only do some shuffling to obtain these 2-byte
+    codes and finally compress the whole vector register with a single
+    shuffle.
 
-  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
-  constexpr const uint8_t CARRY =
-      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
-  const simd8<uint8_t> byte_1_low =
-      (prev1 & 0x0F)
-          .lookup_16<uint8_t>(
-              // ____0000 ________
-              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-              // ____0001 ________
-              CARRY | OVERLONG_2,
-              // ____001_ ________
-              CARRY, CARRY,
+    We need a 256-entry lookup table to get a compression pattern
+    and the number of output bytes in the compressed vector register.
+    Each entry occupies 17 bytes.
 
-              // ____0100 ________
-              CARRY | TOO_LARGE,
-              // ____0101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              // ____011_ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
+    Ad 2.
 
-              // ____1___ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              // ____1101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000);
-  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT,
+    When values fit in 16-bit code units, but are above 0x07ff, then
+    a single word may produce one, two or three UTF8 bytes.
 
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
-          OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+    We prepare data for all these three cases in two registers.
+    The first register contains lower two UTF8 bytes (used in all
+    cases), while the second one contains just the third byte for
+    the three-UTF8-bytes case.
 
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
-  return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t>
-check_multibyte_lengths(const simd8<uint8_t> input,
-                        const simd8<uint8_t> prev_input,
-                        const simd8<uint8_t> sc) {
-  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
-  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
-  simd8<uint8_t> must23 =
-      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
-  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
-  return must23_80 ^ sc;
-}
+    Finally these two registers are interleaved forming an eight-element
+    array of 32-bit values. The array spans two vector registers.
+    The bytes from the registers are compressed using two shuffles.
 
-//
-// Return nonzero if there are incomplete multibyte characters at the end of the
-// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
-//
-simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
-  // If the previous input's last 3 bytes match this, they're too short (they
-  // ended at EOF):
-  // ... 1111____ 111_____ 11______
-  static const uint8_t max_array[32] = {255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        255,
-                                        0b11110000u - 1,
-                                        0b11100000u - 1,
-                                        0b11000000u - 1};
-  const simd8<uint8_t> max_value(
-      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]);
-  return input.gt_bits(max_value);
-}
+    We need a 256-entry lookup table to get a compression pattern
+    and the number of output bytes in the compressed vector register.
+    Each entry occupies 17 bytes.
 
-struct utf8_checker {
-  // If this is nonzero, there has been a UTF-8 error.
-  simd8<uint8_t> error;
-  // The last input we received
-  simd8<uint8_t> prev_input_block;
-  // Whether the last input we received was incomplete (used for ASCII fast
-  // path)
-  simd8<uint8_t> prev_incomplete;
 
-  //
-  // Check whether the current bytes are valid UTF-8.
-  //
-  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
-                                              const simd8<uint8_t> prev_input) {
-    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
-    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
-    // small negative numbers)
-    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-    simd8<uint8_t> sc = check_special_cases(input, prev1);
-    this->error |= check_multibyte_lengths(input, prev_input, sc);
-  }
+    To summarize:
+    - We need two 256-entry tables that have 8704 bytes in total.
+*/
+/*
+  Returns {first unprocessed code unit of buf, utf8_output}; the first member
+  is nullptr on a bad surrogate pair. A scalar routine should convert the tail.
+*/
+template <endianness big_endian>
+std::pair<const char16_t *, char *>
+lsx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char16_t *end = buf + len;
 
-  // The only problem that can happen at EOF is that a multibyte character is
-  // too short or a byte value too large in the last bytes: check_special_cases
-  // only checks for bytes too large in the first of two bytes.
-  simdutf_really_inline void check_eof() {
-    // If the previous block had incomplete UTF-8 characters at the end, an
-    // ASCII block can't possibly finish them.
-    this->error |= this->prev_incomplete;
-  }
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
 
-  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
-    if (simdutf_likely(is_ascii(input))) {
-      this->error |= this->prev_incomplete;
-    } else {
-      // you might think that a for-loop would work, but under Visual Studio, it
-      // is not good enough.
-      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
-                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-                    "We support either two or four chunks per 64-byte block.");
-      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
-        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+  __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7ff)); // 2-byte UTF-8 limit
+  while (buf + 16 + safety_margin <= end) { // >= 28 code units left
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    if (!match_system(big_endian)) {
+      in = lsx_swap_bytes(in);
+    }
+    if (__lsx_bz_v(
+            __lsx_vslt_hu(__lsx_vrepli_h(0x7F), in))) { // ASCII fast path!!!!
+      // It is common enough that we have sequences of 16 consecutive ASCII
+      // characters: also load the next 8 code units (byte offset 16).
+      __m128i nextin = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
+      if (!match_system(big_endian)) {
+        nextin = lsx_swap_bytes(nextin);
+      }
+      if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), nextin))) {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        __m128i utf8_packed = __lsx_vpickev_b(nextin, in);
+        // 2. store (16 bytes)
+        __lsx_vst(utf8_packed, utf8_output, 0);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+      } else {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        __m128i utf8_packed = __lsx_vpickev_b(in, in);
+        // 2. store (8 bytes)
+        __lsx_vst(utf8_packed, utf8_output, 0);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        in = nextin;
       }
-      this->prev_incomplete =
-          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
-      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
     }
-  }
 
-  // do not forget to call check_eof!
-  simdutf_really_inline bool errors() const {
-    return this->error.any_bits_set_anywhere();
-  }
+    __m128i zero = __lsx_vldi(0);
+    if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, in))) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      // t0 = [000a|aaaa|bbbb|bb00]
+      __m128i t0 = __lsx_vslli_h(in, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/));
+      // t2 = [0000|0000|00bb|bbbb]
+      __m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f));
+      // t3 = [000a|aaaa|00bb|bbbb]
+      __m128i t3 = __lsx_vor_v(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xc080));
+      __m128i t4 = __lsx_vor_v(t3, v_c080);
+      // 2. merge ASCII and 2-byte codewords
+      __m128i one_byte_bytemask =
+          __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F /*0x007F*/));
+      __m128i utf8_unpacked = __lsx_vbitsel_v(t4, in, one_byte_bytemask);
+      // 3. prepare bitmask for 8-bit lookup
+      uint32_t m2 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
+      // 4. pack the bytes
+      const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                               [lsx_1_2_utf8_bytes_mask[m2]][0];
+      __m128i shuffle = __lsx_vld(row, 1);
+      __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
+      // 5. store bytes
+      __lsx_vst(utf8_packed, utf8_output, 0);
+      // 6. adjust pointers
+      buf += 8;
+      utf8_output += row[0];
+      continue;
+    }
+    __m128i surrogates_bytemask =
+        __lsx_vseq_h(__lsx_vand_v(in, __lsx_vldi(-2568 /*0xF800*/)),
+                     __lsx_vldi(-2600 /*0xD800*/));
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+    // However, it is likely an uncommon occurrence.
+    if (__lsx_bz_v(surrogates_bytemask)) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+      /* In this branch we handle three cases:
+           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+         single UTF-8 byte
+           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
+         two UTF-8 bytes
+           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+         three UTF-8 bytes
 
-}; // struct utf8_checker
-} // namespace utf8_validation
+          We expand the input word (16-bit) into two code units (32-bit), thus
+          we have room for four bytes. However, we need five distinct bit
+          layouts. Note that the last byte in cases #2 and #3 is the same.
 
-using utf8_validation::utf8_checker;
+          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+          in register t2.
 
-} // unnamed namespace
-} // namespace ppc64
-} // namespace simdutf
-/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
-/* begin file src/generic/utf8_validation/utf8_validator.h */
-namespace simdutf {
-namespace ppc64 {
-namespace {
-namespace utf8_validation {
+          We precompute byte 1 for case #3 and -- **conditionally** --
+         precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+         they differ by exactly one bit.
 
-/**
- * Validates that the string is actual UTF-8.
- */
-template <class checker>
-bool generic_validate_utf8(const uint8_t *input, size_t length) {
-  checker c{};
-  buf_block_reader<64> reader(input, length);
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
-    c.check_next_input(in);
-    reader.advance();
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  c.check_next_input(in);
-  reader.advance();
-  c.check_eof();
-  return !c.errors();
-}
+          Finally from these two code units we build proper UTF-8 sequence,
+         taking into account the case (i.e, the number of bytes to write).
+        */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      __m128i t0 = __lsx_vpickev_b(in, in);
+      t0 = __lsx_vilvl_b(t0, t0);
 
-bool generic_validate_utf8(const char *input, size_t length) {
-  return generic_validate_utf8<utf8_checker>(
-      reinterpret_cast<const uint8_t *>(input), length);
-}
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] (mask is 0x3F7F)
+      __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
+      __m128i t1 = __lsx_vand_v(t0, v_3f7f);
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/));
 
-/**
- * Validates that the string is actual UTF-8 and stops on errors.
- */
-template <class checker>
-result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
-  checker c{};
-  buf_block_reader<64> reader(input, length);
-  size_t count{0};
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
-    c.check_next_input(in);
-    if (c.errors()) {
-      if (count != 0) {
-        count--;
-      } // Sometimes the error is only detected in the next chunk
-      result res = scalar::utf8::rewind_and_validate_with_errors(
-          reinterpret_cast<const char *>(input),
-          reinterpret_cast<const char *>(input + count), length - count);
-      res.count += count;
-      return res;
-    }
-    reader.advance();
-    count += 64;
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  c.check_next_input(in);
-  reader.advance();
-  c.check_eof();
-  if (c.errors()) {
-    if (count != 0) {
-      count--;
-    } // Sometimes the error is only detected in the next chunk
-    result res = scalar::utf8::rewind_and_validate_with_errors(
-        reinterpret_cast<const char *>(input),
-        reinterpret_cast<const char *>(input) + count, length - count);
-    res.count += count;
-    return res;
-  } else {
-    return result(error_code::SUCCESS, length);
-  }
-}
+      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+      __m128i s0 = __lsx_vsrli_h(in, 12);
+      // s1: [aaaa|bbbb|bbcc|cccc] => [aabb|bbbb|cccc|cc00] (shift left by 2)
+      __m128i s1 = __lsx_vslli_h(in, 2);
+      // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
+      s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/));
 
-result generic_validate_utf8_with_errors(const char *input, size_t length) {
-  return generic_validate_utf8_with_errors<utf8_checker>(
-      reinterpret_cast<const uint8_t *>(input), length);
+      // [00bb|bbbb|0000|aaaa]
+      __m128i s2 = __lsx_vor_v(s0, s1);
+      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
+      __m128i s3 = __lsx_vor_v(s2, v_c0e0);
+      __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff);
+      __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask,
+                                 __lsx_vldi(-2752 /*0x4000*/));
+      __m128i s4 = __lsx_vxor_v(s3, m0); // 3-byte lanes: [11bb|bbbb]->[10bb|bbbb]
+
+      // 4. expand code units 16-bit => 32-bit
+      __m128i out0 = __lsx_vilvl_h(s4, t2);
+      __m128i out1 = __lsx_vilvh_h(s4, t2);
+
+      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+      __m128i one_byte_bytemask = __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F));
+
+      __m128i one_or_two_bytes_bytemask_low =
+          __lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
+      __m128i one_or_two_bytes_bytemask_high =
+          __lsx_vilvh_h(one_or_two_bytes_bytemask, zero);
+
+      __m128i one_byte_bytemask_low =
+          __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
+      __m128i one_byte_bytemask_high =
+          __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+      const uint32_t mask0 = __lsx_vpickve2gr_bu(
+          __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_low,
+                                      one_byte_bytemask_low)),
+          0);
+      const uint32_t mask1 = __lsx_vpickve2gr_bu(
+          __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_high,
+                                      one_byte_bytemask_high)),
+          0);
+
+      const uint8_t *row0 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      __m128i shuffle0 = __lsx_vld(row0, 1);
+      __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);
+
+      const uint8_t *row1 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      __m128i shuffle1 = __lsx_vld(row1, 1);
+      __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);
+
+      __lsx_vst(utf8_0, utf8_output, 0);
+      utf8_output += row0[0];
+      __lsx_vst(utf8_1, utf8_output, 0);
+      utf8_output += row1[0];
+
+      buf += 8;
+      // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15; // scalar-convert at most 15 code units per round
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1); // keep buf[k + 1] inside the input
+      }
+      for (; k < forward; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if ((word & 0xFF80) == 0) {
+          *utf8_output++ = char(word);
+        } else if ((word & 0xF800) == 0) {
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xF800) != 0xD800) {
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) { // not a high + low surrogate pair
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char *>(utf8_output));
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value >> 18) | 0b11110000);
+          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
+    }
+  } // while
+  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
 }
 
-template <class checker>
-bool generic_validate_ascii(const uint8_t *input, size_t length) {
-  buf_block_reader<64> reader(input, length);
-  uint8_t blocks[64]{};
-  simd::simd8x64<uint8_t> running_or(blocks);
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
-    running_or |= in;
-    reader.advance();
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  running_or |= in;
-  return running_or.is_ascii();
-}
+/*
+  Returns a pair: a result struct and utf8_output.
+  If there is an error, the count field of the result is the position of the
+  error. Otherwise, it is the position of the first unprocessed byte in buf
+  (even if finished). A scalar routine should carry on the conversion of the
+  tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char *>
+lsx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
+                                      char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char16_t *start = buf;
+  const char16_t *end = buf + len;
 
-bool generic_validate_ascii(const char *input, size_t length) {
-  return generic_validate_ascii<utf8_checker>(
-      reinterpret_cast<const uint8_t *>(input), length);
-}
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+  while (buf + 16 + safety_margin <= end) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    if (!match_system(big_endian)) {
+      in = lsx_swap_bytes(in);
+    }
+    if (__lsx_bz_v(
+            __lsx_vslt_hu(__lsx_vrepli_h(0x7F), in))) { // ASCII fast path!!!!
+      // It is common enough that we have sequences of 16 consecutive ASCII
+      // characters.
+      __m128i nextin = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
+      if (!match_system(big_endian)) {
+        nextin = lsx_swap_bytes(nextin);
+      }
+      if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), nextin))) {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        __m128i utf8_packed = __lsx_vpickev_b(nextin, in);
+        // 2. store (16 bytes)
+        __lsx_vst(utf8_packed, utf8_output, 0);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
+      } else {
+        // 1. pack the bytes
+        // obviously suboptimal.
+        __m128i utf8_packed = __lsx_vpickev_b(in, in);
+        // 2. store (8 bytes)
+        __lsx_vst(utf8_packed, utf8_output, 0);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        in = nextin;
+      }
+    }
 
-template <class checker>
-result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) {
-  buf_block_reader<64> reader(input, length);
-  size_t count{0};
-  while (reader.has_full_block()) {
-    simd::simd8x64<uint8_t> in(reader.full_block());
-    if (!in.is_ascii()) {
-      result res = scalar::ascii::validate_with_errors(
-          reinterpret_cast<const char *>(input + count), length - count);
-      return result(res.error, count + res.count);
+    __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7ff));
+    __m128i zero = __lsx_vldi(0);
+    if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, in))) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+      // expected output   : [110a|aaaa|10bb|bbbb] x 8
+      // t0 = [000a|aaaa|bbbb|bb00]
+      __m128i t0 = __lsx_vslli_h(in, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/));
+      // t2 = [0000|0000|00bb|bbbb]
+      __m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f));
+      // t3 = [000a|aaaa|00bb|bbbb]
+      __m128i t3 = __lsx_vor_v(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xc080));
+      __m128i t4 = __lsx_vor_v(t3, v_c080);
+      // 2. merge ASCII and 2-byte codewords
+      __m128i one_byte_bytemask =
+          __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F /*0x007F*/));
+      __m128i utf8_unpacked = __lsx_vbitsel_v(t4, in, one_byte_bytemask);
+      // 3. prepare bitmask for 8-bit lookup
+      uint32_t m2 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
+      // 4. pack the bytes
+      const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                               [lsx_1_2_utf8_bytes_mask[m2]][0];
+      __m128i shuffle = __lsx_vld(row, 1);
+      __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
+      // 5. store bytes
+      __lsx_vst(utf8_packed, utf8_output, 0);
+      // 6. adjust pointers
+      buf += 8;
+      utf8_output += row[0];
+      continue;
     }
-    reader.advance();
+    __m128i surrogates_bytemask =
+        __lsx_vseq_h(__lsx_vand_v(in, __lsx_vldi(-2568 /*0xF800*/)),
+                     __lsx_vldi(-2600 /*0xD800*/));
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+    // However, it is likely an uncommon occurrence.
+    if (__lsx_bz_v(surrogates_bytemask)) {
+      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+      /* In this branch we handle three cases:
+           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+         single UTF-8 byte
+           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
+         two UTF-8 bytes
+           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+         three UTF-8 bytes
 
-    count += 64;
-  }
-  uint8_t block[64]{};
-  reader.get_remainder(block);
-  simd::simd8x64<uint8_t> in(block);
-  if (!in.is_ascii()) {
-    result res = scalar::ascii::validate_with_errors(
-        reinterpret_cast<const char *>(input + count), length - count);
-    return result(res.error, count + res.count);
-  } else {
-    return result(error_code::SUCCESS, length);
-  }
-}
+          We expand the input word (16-bit) into two code units (32-bit), thus
+          we have room for four bytes. However, we need five distinct bit
+          layouts. Note that the last byte in cases #2 and #3 is the same.
 
-result generic_validate_ascii_with_errors(const char *input, size_t length) {
-  return generic_validate_ascii_with_errors<utf8_checker>(
-      reinterpret_cast<const uint8_t *>(input), length);
-}
+          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+          in register t2.
 
-} // namespace utf8_validation
-} // unnamed namespace
-} // namespace ppc64
-} // namespace simdutf
-/* end file src/generic/utf8_validation/utf8_validator.h */
-// transcoding from UTF-8 to UTF-16
-/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
+          We precompute byte 1 for case #3 and -- **conditionally** --
+         precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+         they differ by exactly one bit.
 
-namespace simdutf {
-namespace ppc64 {
-namespace {
-namespace utf8_to_utf16 {
-using namespace simd;
+          Finally from these two code units we build proper UTF-8 sequence,
+         taking into account the case (i.e, the number of bytes to write).
+        */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      __m128i t0 = __lsx_vpickev_b(in, in);
+      t0 = __lsx_vilvl_b(t0, t0);
 
-simdutf_really_inline simd8<uint8_t>
-check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-  // Bit 1 = Too Long (ASCII followed by continuation)
-  // Bit 2 = Overlong 3-byte
-  // Bit 4 = Surrogate
-  // Bit 5 = Overlong 2-byte
-  // Bit 7 = Two Continuations
-  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
-                                               // 11______ 11______
-  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
-  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
-  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
-  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
-                                               // 11110100 101_____
-                                               // 11110101 1001____
-                                               // 11110101 101_____
-                                               // 1111011_ 1001____
-                                               // 1111011_ 101_____
-                                               // 11111___ 1001____
-                                               // 11111___ 101_____
-  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-  // 11110101 1000____
-  // 1111011_ 1000____
-  // 11111___ 1000____
-  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] (mask is 0x3F7F)
+      __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
+      __m128i t1 = __lsx_vand_v(t0, v_3f7f);
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688));
 
-  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
-  constexpr const uint8_t CARRY =
-      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
-  const simd8<uint8_t> byte_1_low =
-      (prev1 & 0x0F)
-          .lookup_16<uint8_t>(
-              // ____0000 ________
-              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-              // ____0001 ________
-              CARRY | OVERLONG_2,
-              // ____001_ ________
-              CARRY, CARRY,
+      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+      __m128i s0 = __lsx_vsrli_h(in, 12);
+      // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+      __m128i s1 = __lsx_vslli_h(in, 2);
+      // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
+      s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/));
 
-              // ____0100 ________
-              CARRY | TOO_LARGE,
-              // ____0101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              // ____011_ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
+      // [00bb|bbbb|0000|aaaa]
+      __m128i s2 = __lsx_vor_v(s0, s1);
+      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
+      __m128i s3 = __lsx_vor_v(s2, v_c0e0);
+      __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff);
+      __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask,
+                                 __lsx_vldi(-2752 /*0x4000*/));
+      __m128i s4 = __lsx_vxor_v(s3, m0);
 
-              // ____1___ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              // ____1101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000);
-  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT,
+      // 4. expand code units 16-bit => 32-bit
+      __m128i out0 = __lsx_vilvl_h(s4, t2);
+      __m128i out1 = __lsx_vilvh_h(s4, t2);
 
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
-          OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+      __m128i one_byte_bytemask = __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F));
+
+      __m128i one_or_two_bytes_bytemask_low =
+          __lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
+      __m128i one_or_two_bytes_bytemask_high =
+          __lsx_vilvh_h(one_or_two_bytes_bytemask, zero);
+
+      __m128i one_byte_bytemask_low =
+          __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
+      __m128i one_byte_bytemask_high =
+          __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+      const uint32_t mask0 = __lsx_vpickve2gr_bu(
+          __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_low,
+                                      one_byte_bytemask_low)),
+          0);
+      const uint32_t mask1 = __lsx_vpickve2gr_bu(
+          __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_high,
+                                      one_byte_bytemask_high)),
+          0);
 
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
-  return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t>
-check_multibyte_lengths(const simd8<uint8_t> input,
-                        const simd8<uint8_t> prev_input,
-                        const simd8<uint8_t> sc) {
-  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
-  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
-  simd8<uint8_t> must23 =
-      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
-  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
-  return must23_80 ^ sc;
-}
+      const uint8_t *row0 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+      __m128i shuffle0 = __lsx_vld(row0, 1);
+      __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);
 
-struct validating_transcoder {
-  // If this is nonzero, there has been a UTF-8 error.
-  simd8<uint8_t> error;
+      const uint8_t *row1 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+      __m128i shuffle1 = __lsx_vld(row1, 1);
+      __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);
 
-  validating_transcoder() : error(uint8_t(0)) {}
-  //
-  // Check whether the current bytes are valid UTF-8.
-  //
-  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
-                                              const simd8<uint8_t> prev_input) {
-    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
-    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
-    // small negative numbers)
-    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-    simd8<uint8_t> sc = check_special_cases(input, prev1);
-    this->error |= check_multibyte_lengths(input, prev_input, sc);
-  }
+      __lsx_vst(utf8_0, utf8_output, 0);
+      utf8_output += row0[0];
+      __lsx_vst(utf8_1, utf8_output, 0);
+      utf8_output += row1[0];
 
-  template <endianness endian>
-  simdutf_really_inline size_t convert(const char *in, size_t size,
-                                       char16_t *utf16_output) {
-    size_t pos = 0;
-    char16_t *start{utf16_output};
-    // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
-    // last 16 bytes, and if the data is valid, then it is entirely safe because
-    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
-    // generally assume that you have valid UTF-8 input, so we are going to go
-    // back from the end counting 8 leading bytes, to give us a good margin.
-    size_t leading_byte = 0;
-    size_t margin = size;
-    for (; margin > 0 && leading_byte < 8; margin--) {
-      leading_byte += (int8_t(in[margin - 1]) > -65);
-    }
-    // If the input is long enough, then we have that margin-1 is the eight last
-    // leading byte.
-    const size_t safety_margin = size - margin + 1; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      if (input.is_ascii()) {
-        input.store_ascii_as_utf16<endian>(utf16_output);
-        utf16_output += 64;
-        pos += 64;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio,
-        // it is not good enough.
-        static_assert(
-            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
-                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        auto zero = simd8<uint8_t>{uint8_t(0)};
-        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-        }
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        if (utf8_continuation_mask & 1) {
-          return 0; // error
-        }
-        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-        // We process in blocks of up to 12 bytes except possibly
-        // for fast paths which may process up to 16 bytes. For the
-        // slow path to work, we should have at least 12 input bytes left.
-        size_t max_starting_point = (pos + 64) - 12;
-        // Next loop is going to run at least five times.
-        while (pos < max_starting_point) {
-          // Performance note: our ability to compute 'consumed' and
-          // then shift and recompute is critical. If there is a
-          // latency of, say, 4 cycles on getting 'consumed', then
-          // the inner loop might have a total latency of about 6 cycles.
-          // Yet we process between 6 to 12 inputs bytes, thus we get
-          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-          // for this section of the code. Hence, there is a limit
-          // to how much we can further increase this latency before
-          // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_utf16<endian>(
-              in + pos, utf8_end_of_code_point_mask, utf16_output);
-          pos += consumed;
-          utf8_end_of_code_point_mask >>= consumed;
-        }
-        // At this point there may remain between 0 and 12 bytes in the
-        // 64-byte block. These bytes will be processed again. So we have an
-        // 80% efficiency (in the worst case). In practice we expect an
-        // 85% to 90% efficiency.
+      buf += 8;
+      // surrogate pair(s) in a register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
       }
-    }
-    if (errors()) {
-      return 0;
-    }
-    if (pos < size) {
-      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
-          in + pos, size - pos, utf16_output);
-      if (howmany == 0) {
-        return 0;
+      for (; k < forward; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if ((word & 0xFF80) == 0) {
+          *utf8_output++ = char(word);
+        } else if ((word & 0xF800) == 0) {
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xF800) != 0xD800) {
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          // must be a surrogate pair
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
+          k++;
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) {
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k - 1),
+                reinterpret_cast<char *>(utf8_output));
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf8_output++ = char((value >> 18) | 0b11110000);
+          *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((value & 0b111111) | 0b10000000);
+        }
       }
-      utf16_output += howmany;
+      buf += k;
     }
-    return utf16_output - start;
-  }
+  } // while
 
-  template <endianness endian>
-  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
-                                                   char16_t *utf16_output) {
-    size_t pos = 0;
-    char16_t *start{utf16_output};
-    // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
-    // last 16 bytes, and if the data is valid, then it is entirely safe because
-    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
-    // generally assume that you have valid UTF-8 input, so we are going to go
-    // back from the end counting 8 leading bytes, to give us a good margin.
-    size_t leading_byte = 0;
-    size_t margin = size;
-    for (; margin > 0 && leading_byte < 8; margin--) {
-      leading_byte += (int8_t(in[margin - 1]) > -65);
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        reinterpret_cast<char *>(utf8_output));
+}
+/* end file src/lsx/lsx_convert_utf16_to_utf8.cpp */
+/* begin file src/lsx/lsx_convert_utf16_to_utf32.cpp */
+template <endianness big_endian>
+std::pair<const char16_t *, char32_t *>
+lsx_convert_utf16_to_utf32(const char16_t *buf, size_t len,
+                           char32_t *utf32_out) {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+  const char16_t *end = buf + len;
+  // Result: {next unread input unit, or nullptr on a bad pair; output end}.
+  __m128i zero = __lsx_vldi(0);
+  __m128i v_f800 = __lsx_vldi(-2568); /*0xF800*/
+  __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/
+  // Main loop: 8 UTF-16 code units per 128-bit load.
+  while (buf + 8 <= end) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    if (!match_system(big_endian)) {
+      in = lsx_swap_bytes(in);
     }
-    // If the input is long enough, then we have that margin-1 is the eight last
-    // leading byte.
-    const size_t safety_margin = size - margin + 1; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      if (input.is_ascii()) {
-        input.store_ascii_as_utf16<endian>(utf16_output);
-        utf16_output += 64;
-        pos += 64;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio,
-        // it is not good enough.
-        static_assert(
-            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
-                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        auto zero = simd8<uint8_t>{uint8_t(0)};
-        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-        }
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        if (errors() || (utf8_continuation_mask & 1)) {
-          // rewind_and_convert_with_errors will seek a potential error from
-          // in+pos onward, with the ability to go back up to pos bytes, and
-          // read size-pos bytes forward.
-          result res =
-              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
-                  pos, in + pos, size - pos, utf16_output);
-          res.count += pos;
-          return res;
-        }
-        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-        // We process in blocks of up to 12 bytes except possibly
-        // for fast paths which may process up to 16 bytes. For the
-        // slow path to work, we should have at least 12 input bytes left.
-        size_t max_starting_point = (pos + 64) - 12;
-        // Next loop is going to run at least five times.
-        while (pos < max_starting_point) {
-          // Performance note: our ability to compute 'consumed' and
-          // then shift and recompute is critical. If there is a
-          // latency of, say, 4 cycles on getting 'consumed', then
-          // the inner loop might have a total latency of about 6 cycles.
-          // Yet we process between 6 to 12 inputs bytes, thus we get
-          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-          // for this section of the code. Hence, there is a limit
-          // to how much we can further increase this latency before
-          // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_utf16<endian>(
-              in + pos, utf8_end_of_code_point_mask, utf16_output);
-          pos += consumed;
-          utf8_end_of_code_point_mask >>= consumed;
+    // Flag each 16-bit lane where (unit & 0xF800) == 0xD800 (a surrogate).
+    __m128i surrogates_bytemask =
+        __lsx_vseq_h(__lsx_vand_v(in, v_f800), v_d800);
+    // It might seem like checking for surrogates_bitmask == 0xc000 could help.
+    // However, it is likely an uncommon occurrence.
+    if (__lsx_bz_v(surrogates_bytemask)) {
+      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
+      // units
+      __lsx_vst(__lsx_vilvl_h(zero, in), utf32_output, 0); // units 0..3
+      __lsx_vst(__lsx_vilvh_h(zero, in), utf32_output, 16); // units 4..7
+      utf32_output += 8;
+      buf += 8;
+      // otherwise: at least one surrogate code unit in the register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15; // scan bound; keeps buf[k + 1] below inside input
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1); // fewer than 16 units left
+      }
+      for (; k < forward; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if ((word & 0xF800) != 0xD800) { // not a surrogate: copy as is
+          *utf32_output++ = char32_t(word);
+        } else {
+          // expected to be a surrogate pair; validated below
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
+          k++; // the pair consumes two input units
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) { // not a high unit followed by a low one
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char32_t *>(utf32_output));
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
         }
-        // At this point there may remain between 0 and 12 bytes in the
-        // 64-byte block. These bytes will be processed again. So we have an
-        // 80% efficiency (in the worst case). In practice we expect an
-        // 85% to 90% efficiency.
       }
+      buf += k; // skip everything the scalar loop consumed
     }
-    if (errors()) {
-      // rewind_and_convert_with_errors will seek a potential error from in+pos
-      // onward, with the ability to go back up to pos bytes, and read size-pos
-      // bytes forward.
-      result res =
-          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
-              pos, in + pos, size - pos, utf16_output);
-      res.count += pos;
-      return res;
+  } // while
+  return std::make_pair(buf, reinterpret_cast<char32_t *>(utf32_output));
+}
+
+/*
+  Returns a pair: a result struct and utf32_output.
+  If there is an error, the count field of the result is the position of the
+  error. Otherwise, it is the position of the first unprocessed code unit in
+  buf (even if finished). A scalar routine should carry on the conversion of
+  the tail if needed.
+*/
+template <endianness big_endian>
+std::pair<result, char32_t *>
+lsx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
+                                       char32_t *utf32_out) {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
+  const char16_t *start = buf;
+  const char16_t *end = buf + len;
+  // On a bad surrogate, result.count is the index of the offending unit.
+  __m128i zero = __lsx_vldi(0);
+  __m128i v_f800 = __lsx_vldi(-2568); /*0xF800*/
+  __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/
+  // Main loop: 8 UTF-16 code units per 128-bit load.
+  while (buf + 8 <= end) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    if (!match_system(big_endian)) {
+      in = lsx_swap_bytes(in);
     }
-    if (pos < size) {
-      // rewind_and_convert_with_errors will seek a potential error from in+pos
-      // onward, with the ability to go back up to pos bytes, and read size-pos
-      // bytes forward.
-      result res =
-          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
-              pos, in + pos, size - pos, utf16_output);
-      if (res.error) { // In case of error, we want the error position
-        res.count += pos;
-        return res;
-      } else { // In case of success, we want the number of word written
-        utf16_output += res.count;
+    // Flag each 16-bit lane where (unit & 0xF800) == 0xD800 (a surrogate).
+    __m128i surrogates_bytemask =
+        __lsx_vseq_h(__lsx_vand_v(in, v_f800), v_d800);
+    if (__lsx_bz_v(surrogates_bytemask)) {
+      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
+      // units
+      __lsx_vst(__lsx_vilvl_h(zero, in), utf32_output, 0); // units 0..3
+      __lsx_vst(__lsx_vilvh_h(zero, in), utf32_output, 16); // units 4..7
+      utf32_output += 8;
+      buf += 8;
+      // otherwise: at least one surrogate code unit in the register
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15; // scan bound; keeps buf[k + 1] below inside input
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1); // fewer than 16 units left
+      }
+      for (; k < forward; k++) {
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
+        if ((word & 0xF800) != 0xD800) { // not a surrogate: copy as is
+          *utf32_output++ = char32_t(word);
+        } else {
+          // expected to be a surrogate pair; validated below
+          uint16_t diff = uint16_t(word - 0xD800);
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
+          k++; // pair uses two units; k - 1 now indexes its first unit
+          uint16_t diff2 = uint16_t(next_word - 0xDC00);
+          if ((diff | diff2) > 0x3FF) { // not a high unit followed by a low one
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k - 1),
+                reinterpret_cast<char32_t *>(utf32_output));
+          }
+          uint32_t value = (diff << 10) + diff2 + 0x10000;
+          *utf32_output++ = char32_t(value);
+        }
       }
+      buf += k; // skip everything the scalar loop consumed
     }
-    return result(error_code::SUCCESS, utf16_output - start);
-  }
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        reinterpret_cast<char32_t *>(utf32_output));
+}
+/* end file src/lsx/lsx_convert_utf16_to_utf32.cpp */
 
-  simdutf_really_inline bool errors() const {
-    return this->error.any_bits_set_anywhere();
-  }
+/* begin file src/lsx/lsx_convert_utf32_to_latin1.cpp */
+std::pair<const char32_t *, char *>
+lsx_convert_utf32_to_latin1(const char32_t *buf, size_t len,
+                            char *latin1_output) {
+  const char32_t *end = buf + len;
+  const v16u8 shuf_mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
+  __m128i v_ff = __lsx_vrepli_w(0xFF); // largest Latin-1 value, per lane
 
-}; // struct utf8_checker
-} // namespace utf8_to_utf16
-} // unnamed namespace
-} // namespace ppc64
-} // namespace simdutf
-/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
-/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
+  while (buf + 16 <= end) { // 8 code points are converted per pass
+    __m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
+    __m128i in2 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);
 
-namespace simdutf {
-namespace ppc64 {
-namespace {
-namespace utf8_to_utf16 {
+    __m128i in12 = __lsx_vor_v(in1, in2); // OR keeps any bit above 0xFF
+    if (__lsx_bz_v(__lsx_vslt_wu(v_ff, in12))) {
+      // 1. pack byte 0 of each of the 8 code points (shuf_mask: 0, 4, ...)
+      __m128i latin1_packed = __lsx_vshuf_b(in2, in1, (__m128i)shuf_mask);
+      // 2. store (full 16-byte vector; only the first 8 bytes are payload)
+      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+      // 3. adjust pointers
+      buf += 8;
+      latin1_output += 8;
+    } else { // some code point is above 0xFF: report failure via nullptr
+      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
+    }
+  } // while
+  return std::make_pair(buf, latin1_output); // buf: first unconverted unit
+}
 
-using namespace simd;
+std::pair<result, char *>
+lsx_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+                                        char *latin1_output) {
+  const char32_t *start = buf;
+  const char32_t *end = buf + len;
 
-template <endianness endian>
-simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
-                                         char16_t *utf16_output) noexcept {
-  // The implementation is not specific to haswell and should be moved to the
-  // generic directory.
-  size_t pos = 0;
-  char16_t *start{utf16_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while (pos + 64 + safety_margin <= size) {
-    // this loop could be unrolled further. For example, we could process the
-    // mask far more than 64 bytes.
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if (in.is_ascii()) {
-      in.store_ascii_as_utf16<endian>(utf16_output);
-      utf16_output += 64;
-      pos += 64;
+  const v16u8 shuf_mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
+  __m128i v_ff = __lsx_vrepli_w(0xFF); // largest Latin-1 value, per lane
+  // On error, result.count is the index of the first code point above 0xFF.
+  while (buf + 16 <= end) { // 8 code points are converted per pass
+    __m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
+    __m128i in2 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);
+    // OR the two vectors so a single compare covers all 8 code points.
+    __m128i in12 = __lsx_vor_v(in1, in2);
+
+    if (__lsx_bz_v(__lsx_vslt_wu(v_ff, in12))) {
+      // 1. pack byte 0 of each of the 8 code points (shuf_mask: 0, 4, ...)
+      __m128i latin1_packed = __lsx_vshuf_b(in2, in1, (__m128i)shuf_mask);
+      // 2. store (full 16-byte vector; only the first 8 bytes are payload)
+      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+      // 3. adjust pointers
+      buf += 8;
+      latin1_output += 8;
     } else {
-      // Slow path. We hope that the compiler will recognize that this is a slow
-      // path. Anything that is not a continuation mask is a 'leading byte',
-      // that is, the start of a new code point.
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-      // -65 is 0b10111111 in two-complement's, so largest possible continuation
-      // byte
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      // The *start* of code points is not so useful, rather, we want the *end*
-      // of code points.
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-      // We process in blocks of up to 12 bytes except possibly
-      // for fast paths which may process up to 16 bytes. For the
-      // slow path to work, we should have at least 12 input bytes left.
-      size_t max_starting_point = (pos + 64) - 12;
-      // Next loop is going to run at least five times when using solely
-      // the slow/regular path, and at least four times if there are fast paths.
-      while (pos < max_starting_point) {
-        // Performance note: our ability to compute 'consumed' and
-        // then shift and recompute is critical. If there is a
-        // latency of, say, 4 cycles on getting 'consumed', then
-        // the inner loop might have a total latency of about 6 cycles.
-        // Yet we process between 6 to 12 inputs bytes, thus we get
-        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-        // for this section of the code. Hence, there is a limit
-        // to how much we can further increase this latency before
-        // it seriously harms performance.
-        //
-        // Thus we may allow convert_masked_utf8_to_utf16 to process
-        // more bytes at a time under a fast-path mode where 16 bytes
-        // are consumed at once (e.g., when encountering ASCII).
-        size_t consumed = convert_masked_utf8_to_utf16<endian>(
-            input + pos, utf8_end_of_code_point_mask, utf16_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
+      // Scalar fallback: locate the first of these 8 code points above 0xFF.
+      for (int k = 0; k < 8; k++) {
+        uint32_t word = buf[k];
+        if (word <= 0xff) {
+          *latin1_output++ = char(word); // valid prefix is still written out
+        } else {
+          return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
+                                latin1_output);
+        }
       }
-      // At this point there may remain between 0 and 12 bytes in the
-      // 64-byte block. These bytes will be processed again. So we have an
-      // 80% efficiency (in the worst case). In practice we expect an
-      // 85% to 90% efficiency.
     }
-  }
-  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
-      input + pos, size - pos, utf16_output);
-  return utf16_output - start;
+  } // while
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        latin1_output);
 }
+/* end file src/lsx/lsx_convert_utf32_to_latin1.cpp */
+/* begin file src/lsx/lsx_convert_utf32_to_utf8.cpp */
+std::pair<const char32_t *, char *>
+lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char32_t *end = buf + len;
 
-} // namespace utf8_to_utf16
-} // unnamed namespace
-} // namespace ppc64
-} // namespace simdutf
-/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
-// transcoding from UTF-8 to UTF-32
-/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
+  __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xC080));
+  __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7FF));
+  __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xDFFF));
+  __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/
+  __m128i forbidden_bytemask = __lsx_vldi(0x0);
 
-namespace simdutf {
-namespace ppc64 {
-namespace {
-namespace utf8_to_utf32 {
-using namespace simd;
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
 
-simdutf_really_inline simd8<uint8_t>
-check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
-  // Bit 1 = Too Long (ASCII followed by continuation)
-  // Bit 2 = Overlong 3-byte
-  // Bit 4 = Surrogate
-  // Bit 5 = Overlong 2-byte
-  // Bit 7 = Two Continuations
-  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
-                                               // 11______ 11______
-  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
-  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
-  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
-  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
-  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
-  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
-                                               // 11110100 101_____
-                                               // 11110101 1001____
-                                               // 11110101 101_____
-                                               // 1111011_ 1001____
-                                               // 1111011_ 101_____
-                                               // 11111___ 1001____
-                                               // 11111___ 101_____
-  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
-  // 11110101 1000____
-  // 1111011_ 1000____
-  // 11111___ 1000____
-  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+  while (buf + 16 + safety_margin < end) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
+    __m128i nextin = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);
 
-  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
-      // 0_______ ________ <ASCII in byte 1>
-      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
-      TOO_LONG,
-      // 10______ ________ <continuation in byte 1>
-      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
-      // 1100____ ________ <two byte lead in byte 1>
-      TOO_SHORT | OVERLONG_2,
-      // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
-      // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
-      // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
-  constexpr const uint8_t CARRY =
-      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
-  const simd8<uint8_t> byte_1_low =
-      (prev1 & 0x0F)
-          .lookup_16<uint8_t>(
-              // ____0000 ________
-              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
-              // ____0001 ________
-              CARRY | OVERLONG_2,
-              // ____001_ ________
-              CARRY, CARRY,
+    // Check if no bits set above 16th
+    if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) {
+      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+      // Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp)
+      __m128i utf16_packed = __lsx_vpickev_h(nextin, in);
 
-              // ____0100 ________
-              CARRY | TOO_LARGE,
-              // ____0101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              // ____011_ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
+      if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F),
+                                   utf16_packed))) { // ASCII fast path!!!!
+        // 1. pack the bytes
+        // obviously suboptimal.
+        __m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed);
+        // 2. store (8 bytes)
+        __lsx_vst(utf8_packed, utf8_output, 0);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        continue; // we are done for this round!
+      }
+      __m128i zero = __lsx_vldi(0);
+      if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) {
+        // 1. prepare 2-byte values
+        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+        // expected output   : [110a|aaaa|10bb|bbbb] x 8
 
-              // ____1___ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              // ____1101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000);
-  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
-      // ________ 0_______ <ASCII in byte 2>
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
-      TOO_SHORT, TOO_SHORT,
+        // t0 = [000a|aaaa|bbbb|bb00]
+        const __m128i t0 = __lsx_vslli_h(utf16_packed, 2);
+        // t1 = [000a|aaaa|0000|0000]
+        const __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/));
+        // t2 = [0000|0000|00bb|bbbb]
+        const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f));
+        // t3 = [000a|aaaa|00bb|bbbb]
+        const __m128i t3 = __lsx_vor_v(t1, t2);
+        // t4 = [110a|aaaa|10bb|bbbb]
+        const __m128i t4 = __lsx_vor_v(t3, v_c080);
+        // 2. merge ASCII and 2-byte codewords
+        __m128i one_byte_bytemask =
+            __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/));
+        __m128i utf8_unpacked =
+            __lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask);
+        // 3. prepare bitmask for 8-bit lookup
+        uint32_t m2 =
+            __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
+        // 4. pack the bytes
+        const uint8_t *row =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                [lsx_1_2_utf8_bytes_mask[m2]][0];
+        __m128i shuffle = __lsx_vld(row, 1);
+        __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
+        // 5. store bytes
+        __lsx_vst(utf8_packed, utf8_output, 0);
 
-      // ________ 1000____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
-          OVERLONG_4,
-      // ________ 1001____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
-      // ________ 101_____
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
-      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+        // 6. adjust pointers
+        buf += 8;
+        utf8_output += row[0];
+        continue;
+      } else {
+        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+        forbidden_bytemask = __lsx_vor_v(
+            __lsx_vand_v(
+                __lsx_vsle_h(utf16_packed, v_dfff),  // utf16_packed <= 0xdfff
+                __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+            forbidden_bytemask);
+        /* In this branch we handle three cases:
+    1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single
+    UTF-8 byte
+    2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
+    UTF-8 bytes
+    3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three
+    UTF-8 bytes
+
+    We expand the input word (16-bit) into two code units (32-bit), thus
+    we have room for four bytes. However, we need five distinct bit
+    layouts. Note that the last byte in cases #2 and #3 is the same.
+
+    We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+    in register t2.
+
+    We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+    either byte 1 for case #2 or byte 2 for case #3. Note that they
+    differ by exactly one bit.
+
+    Finally from these two code units we build proper UTF-8 sequence, taking
+    into account the case (i.e, the number of bytes to write).
+  */
+        /**
+         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+         * t2 => [0ccc|cccc] [10cc|cccc]
+         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+         */
+        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+        __m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed);
+        t0 = __lsx_vilvl_b(t0, t0);
+        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+        __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
+        __m128i t1 = __lsx_vand_v(t0, v_3f7f);
+        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+        __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/));
 
-      // ________ 11______
-      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
-  return (byte_1_high & byte_1_low & byte_2_high);
-}
-simdutf_really_inline simd8<uint8_t>
-check_multibyte_lengths(const simd8<uint8_t> input,
-                        const simd8<uint8_t> prev_input,
-                        const simd8<uint8_t> sc) {
-  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
-  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
-  simd8<uint8_t> must23 =
-      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
-  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
-  return must23_80 ^ sc;
-}
+        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+        __m128i s0 = __lsx_vsrli_h(utf16_packed, 12);
+        // s1: [aaaa|bbbb|bbcc|cccc] => [aabb|bbbb|cccc|cc00]
+        __m128i s1 = __lsx_vslli_h(utf16_packed, 2);
+        // [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
+        s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/));
+        // [00bb|bbbb|0000|aaaa]
+        __m128i s2 = __lsx_vor_v(s0, s1);
+        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+        __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
+        __m128i s3 = __lsx_vor_v(s2, v_c0e0);
+        // v_07ff holds 0x07FF in every 16-bit lane (set up before the loop).
+        __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff);
+        __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask,
+                                   __lsx_vldi(-2752 /*0x4000*/));
+        __m128i s4 = __lsx_vxor_v(s3, m0);
 
-struct validating_transcoder {
-  // If this is nonzero, there has been a UTF-8 error.
-  simd8<uint8_t> error;
+        // 4. expand code units 16-bit => 32-bit
+        __m128i out0 = __lsx_vilvl_h(s4, t2);
+        __m128i out1 = __lsx_vilvh_h(s4, t2);
 
-  validating_transcoder() : error(uint8_t(0)) {}
-  //
-  // Check whether the current bytes are valid UTF-8.
-  //
-  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
-                                              const simd8<uint8_t> prev_input) {
-    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
-    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
-    // small negative numbers)
-    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-    simd8<uint8_t> sc = check_special_cases(input, prev1);
-    this->error |= check_multibyte_lengths(input, prev_input, sc);
-  }
+        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+        __m128i one_byte_bytemask =
+            __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F));
+
+        __m128i one_or_two_bytes_bytemask_u16_to_u32_low =
+            __lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
+        __m128i one_or_two_bytes_bytemask_u16_to_u32_high =
+            __lsx_vilvh_h(one_or_two_bytes_bytemask, zero);
+
+        __m128i one_byte_bytemask_u16_to_u32_low =
+            __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
+        __m128i one_byte_bytemask_u16_to_u32_high =
+            __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+        const uint32_t mask0 =
+            __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
+                                    one_or_two_bytes_bytemask_u16_to_u32_low,
+                                    one_byte_bytemask_u16_to_u32_low)),
+                                0);
+        const uint32_t mask1 =
+            __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
+                                    one_or_two_bytes_bytemask_u16_to_u32_high,
+                                    one_byte_bytemask_u16_to_u32_high)),
+                                0);
 
-  simdutf_really_inline size_t convert(const char *in, size_t size,
-                                       char32_t *utf32_output) {
-    size_t pos = 0;
-    char32_t *start{utf32_output};
-    // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
-    // last 16 bytes, and if the data is valid, then it is entirely safe because
-    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
-    // generally assume that you have valid UTF-8 input, so we are going to go
-    // back from the end counting 16 leading bytes, to give us a good margin.
-    size_t leading_byte = 0;
-    size_t margin = size;
-    for (; margin > 0 && leading_byte < 8; margin--) {
-      leading_byte += (int8_t(in[margin - 1]) > -65);
-    }
-    // If the input is long enough, then we have that margin-1 is the fourth
-    // last leading byte.
-    const size_t safety_margin = size - margin + 1; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      if (input.is_ascii()) {
-        input.store_ascii_as_utf32(utf32_output);
-        utf32_output += 64;
-        pos += 64;
-      } else {
-        // you might think that a for-loop would work, but under Visual Studio,
-        // it is not good enough.
-        static_assert(
-            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
-                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        auto zero = simd8<uint8_t>{uint8_t(0)};
-        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-        }
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        if (utf8_continuation_mask & 1) {
-          return 0; // we have an error
-        }
-        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-        // We process in blocks of up to 12 bytes except possibly
-        // for fast paths which may process up to 16 bytes. For the
-        // slow path to work, we should have at least 12 input bytes left.
-        size_t max_starting_point = (pos + 64) - 12;
-        // Next loop is going to run at least five times.
-        while (pos < max_starting_point) {
-          // Performance note: our ability to compute 'consumed' and
-          // then shift and recompute is critical. If there is a
-          // latency of, say, 4 cycles on getting 'consumed', then
-          // the inner loop might have a total latency of about 6 cycles.
-          // Yet we process between 6 to 12 inputs bytes, thus we get
-          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-          // for this section of the code. Hence, there is a limit
-          // to how much we can further increase this latency before
-          // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_utf32(
-              in + pos, utf8_end_of_code_point_mask, utf32_output);
-          pos += consumed;
-          utf8_end_of_code_point_mask >>= consumed;
-        }
-        // At this point there may remain between 0 and 12 bytes in the
-        // 64-byte block. These bytes will be processed again. So we have an
-        // 80% efficiency (in the worst case). In practice we expect an
-        // 85% to 90% efficiency.
+        const uint8_t *row0 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+        __m128i shuffle0 = __lsx_vld(row0, 1);
+        __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);
+
+        const uint8_t *row1 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+        __m128i shuffle1 = __lsx_vld(row1, 1);
+        __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);
+
+        __lsx_vst(utf8_0, utf8_output, 0);
+        utf8_output += row0[0];
+        __lsx_vst(utf8_1, utf8_output, 0);
+        utf8_output += row1[0];
+
+        buf += 8;
       }
-    }
-    if (errors()) {
-      return 0;
-    }
-    if (pos < size) {
-      size_t howmany =
-          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
-      if (howmany == 0) {
-        return 0;
+      // At least one 32-bit word has a bit set above the 16th: it either
+      // needs four UTF-8 bytes or is out of range (> 0x10FFFF).
+    } else {
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFFFF80) == 0) {
+          *utf8_output++ = char(word);
+        } else if ((word & 0xFFFFF800) == 0) {
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xFFFF0000) == 0) {
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char *>(utf8_output));
+          }
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          if (word > 0x10FFFF) {
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char *>(utf8_output));
+          }
+          *utf8_output++ = char((word >> 18) | 0b11110000);
+          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
       }
-      utf32_output += howmany;
+      buf += k;
     }
-    return utf32_output - start;
+  } // while
+
+  // check for invalid input
+  if (__lsx_bnz_v(forbidden_bytemask)) {
+    return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
   }
+  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
+}
 
-  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
-                                                   char32_t *utf32_output) {
-    size_t pos = 0;
-    char32_t *start{utf32_output};
-    // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
-    // last 16 bytes, and if the data is valid, then it is entirely safe because
-    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
-    // generally assume that you have valid UTF-8 input, so we are going to go
-    // back from the end counting 8 leading bytes, to give us a good margin.
-    size_t leading_byte = 0;
-    size_t margin = size;
-    for (; margin > 0 && leading_byte < 8; margin--) {
-      leading_byte += (int8_t(in[margin - 1]) > -65);
-    }
-    // If the input is long enough, then we have that margin-1 is the fourth
-    // last leading byte.
-    const size_t safety_margin = size - margin + 1; // to avoid overruns!
-    while (pos + 64 + safety_margin <= size) {
-      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-      if (input.is_ascii()) {
-        input.store_ascii_as_utf32(utf32_output);
-        utf32_output += 64;
-        pos += 64;
+std::pair<result, char *>
+lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
+                                      char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const char32_t *start = buf;
+  const char32_t *end = buf + len;
+
+  __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xC080));
+  __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7FF));
+  __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xDFFF));
+  __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/
+  __m128i forbidden_bytemask = __lsx_vldi(0x0);
+  const size_t safety_margin =
+      12; // to avoid overruns, see issue
+          // https://github.com/simdutf/simdutf/issues/92
+
+  while (buf + 16 + safety_margin < end) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
+    __m128i nextin = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);
+
+    // Check if no bits set above 16th
+    if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) {
+      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+      // Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp)
+      __m128i utf16_packed = __lsx_vpickev_h(nextin, in);
+
+      if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F),
+                                   utf16_packed))) { // ASCII fast path!!!!
+        // 1. pack the bytes
+        // obviously suboptimal.
+        __m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed);
+        // 2. store (8 bytes)
+        __lsx_vst(utf8_packed, utf8_output, 0);
+        // 3. adjust pointers
+        buf += 8;
+        utf8_output += 8;
+        continue; // we are done for this round!
+      }
+      __m128i zero = __lsx_vldi(0);
+      if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) {
+        // 1. prepare 2-byte values
+        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+        // expected output   : [110a|aaaa|10bb|bbbb] x 8
+
+        // t0 = [000a|aaaa|bbbb|bb00]
+        const __m128i t0 = __lsx_vslli_h(utf16_packed, 2);
+        // t1 = [000a|aaaa|0000|0000]
+        const __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/));
+        // t2 = [0000|0000|00bb|bbbb]
+        const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f));
+        // t3 = [000a|aaaa|00bb|bbbb]
+        const __m128i t3 = __lsx_vor_v(t1, t2);
+        // t4 = [110a|aaaa|10bb|bbbb]
+        const __m128i t4 = __lsx_vor_v(t3, v_c080);
+        // 2. merge ASCII and 2-byte codewords
+        __m128i one_byte_bytemask =
+            __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/));
+        __m128i utf8_unpacked =
+            __lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask);
+        // 3. prepare bitmask for 8-bit lookup
+        uint32_t m2 =
+            __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0);
+        // 4. pack the bytes
+        const uint8_t *row =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                [lsx_1_2_utf8_bytes_mask[m2]][0];
+        __m128i shuffle = __lsx_vld(row, 1);
+        __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle);
+        // 5. store bytes
+        __lsx_vst(utf8_packed, utf8_output, 0);
+
+        // 6. adjust pointers
+        buf += 8;
+        utf8_output += row[0];
+        continue;
       } else {
-        // you might think that a for-loop would work, but under Visual Studio,
-        // it is not good enough.
-        static_assert(
-            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
-                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
-            "We support either two or four chunks per 64-byte block.");
-        auto zero = simd8<uint8_t>{uint8_t(0)};
-        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
-          this->check_utf8_bytes(input.chunks[0], zero);
-          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
-          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
-          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
-        }
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        if (errors() || (utf8_continuation_mask & 1)) {
-          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
-              pos, in + pos, size - pos, utf32_output);
-          res.count += pos;
-          return res;
-        }
-        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-        // We process in blocks of up to 12 bytes except possibly
-        // for fast paths which may process up to 16 bytes. For the
-        // slow path to work, we should have at least 12 input bytes left.
-        size_t max_starting_point = (pos + 64) - 12;
-        // Next loop is going to run at least five times.
-        while (pos < max_starting_point) {
-          // Performance note: our ability to compute 'consumed' and
-          // then shift and recompute is critical. If there is a
-          // latency of, say, 4 cycles on getting 'consumed', then
-          // the inner loop might have a total latency of about 6 cycles.
-          // Yet we process between 6 to 12 inputs bytes, thus we get
-          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-          // for this section of the code. Hence, there is a limit
-          // to how much we can further increase this latency before
-          // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_utf32(
-              in + pos, utf8_end_of_code_point_mask, utf32_output);
-          pos += consumed;
-          utf8_end_of_code_point_mask >>= consumed;
+        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+        forbidden_bytemask = __lsx_vor_v(
+            __lsx_vand_v(
+                __lsx_vsle_h(utf16_packed, v_dfff),  // utf16_packed <= 0xdfff
+                __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+            forbidden_bytemask);
+        if (__lsx_bnz_v(forbidden_bytemask)) {
+          return std::make_pair(result(error_code::SURROGATE, buf - start),
+                                reinterpret_cast<char *>(utf8_output));
         }
-        // At this point there may remain between 0 and 12 bytes in the
-        // 64-byte block. These bytes will be processed again. So we have an
-        // 80% efficiency (in the worst case). In practice we expect an
-        // 85% to 90% efficiency.
-      }
-    }
-    if (errors()) {
-      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
-          pos, in + pos, size - pos, utf32_output);
-      res.count += pos;
-      return res;
-    }
-    if (pos < size) {
-      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
-          pos, in + pos, size - pos, utf32_output);
-      if (res.error) { // In case of error, we want the error position
-        res.count += pos;
-        return res;
-      } else { // In case of success, we want the number of word written
-        utf32_output += res.count;
-      }
-    }
-    return result(error_code::SUCCESS, utf32_output - start);
-  }
+        /* In this branch we handle three cases:
+    1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           - single
+    UTF-8 byte
+    2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
+    UTF-8 bytes
+    3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three
+    UTF-8 bytes
+
+    We expand the input word (16-bit) into two code units (32-bit), thus
+    we have room for four bytes. However, we need five distinct bit
+    layouts. Note that the last byte in cases #2 and #3 is the same.
+
+    We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+    in register t2.
+
+    We precompute byte 1 for case #3 and -- **conditionally** -- precompute
+    either byte 1 for case #2 or byte 2 for case #3. Note that they
+    differ by exactly one bit.
+
+    Finally from these two code units we build proper UTF-8 sequence, taking
+    into account the case (i.e, the number of bytes to write).
+  */
+        /**
+         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+         * t2 => [0ccc|cccc] [10cc|cccc]
+         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+         */
+        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+        __m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed);
+        t0 = __lsx_vilvl_b(t0, t0);
+        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+        __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F));
+        __m128i t1 = __lsx_vand_v(t0, v_3f7f);
+        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+        __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/));
 
-  simdutf_really_inline bool errors() const {
-    return this->error.any_bits_set_anywhere();
-  }
+        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+        __m128i s0 = __lsx_vsrli_h(utf16_packed, 12);
+        // s1: [aaaa|bbbb|bbcc|cccc] => [aabb|bbbb|cccc|cc00]
+        __m128i s1 = __lsx_vslli_h(utf16_packed, 2);
+        // [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
+        s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/));
+        // [00bb|bbbb|0000|aaaa]
+        __m128i s2 = __lsx_vor_v(s0, s1);
+        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+        __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0));
+        __m128i s3 = __lsx_vor_v(s2, v_c0e0);
+        // v_07ff holds 0x07FF in every 16-bit lane (set up before the loop).
+        __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff);
+        __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask,
+                                   __lsx_vldi(-2752 /*0x4000*/));
+        __m128i s4 = __lsx_vxor_v(s3, m0);
 
-}; // struct utf8_checker
-} // namespace utf8_to_utf32
-} // unnamed namespace
-} // namespace ppc64
-} // namespace simdutf
-/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
-/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
+        // 4. expand code units 16-bit => 32-bit
+        __m128i out0 = __lsx_vilvl_h(s4, t2);
+        __m128i out1 = __lsx_vilvh_h(s4, t2);
 
-namespace simdutf {
-namespace ppc64 {
-namespace {
-namespace utf8_to_utf32 {
+        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+        __m128i one_byte_bytemask =
+            __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F));
+
+        __m128i one_or_two_bytes_bytemask_u16_to_u32_low =
+            __lsx_vilvl_h(one_or_two_bytes_bytemask, zero);
+        __m128i one_or_two_bytes_bytemask_u16_to_u32_high =
+            __lsx_vilvh_h(one_or_two_bytes_bytemask, zero);
+
+        __m128i one_byte_bytemask_u16_to_u32_low =
+            __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask);
+        __m128i one_byte_bytemask_u16_to_u32_high =
+            __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+        const uint32_t mask0 =
+            __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
+                                    one_or_two_bytes_bytemask_u16_to_u32_low,
+                                    one_byte_bytemask_u16_to_u32_low)),
+                                0);
+        const uint32_t mask1 =
+            __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v(
+                                    one_or_two_bytes_bytemask_u16_to_u32_high,
+                                    one_byte_bytemask_u16_to_u32_high)),
+                                0);
 
-using namespace simd;
+        const uint8_t *row0 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
+        __m128i shuffle0 = __lsx_vld(row0, 1);
+        __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0);
 
-simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
-                                         char32_t *utf32_output) noexcept {
-  size_t pos = 0;
-  char32_t *start{utf32_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while (pos + 64 + safety_margin <= size) {
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if (in.is_ascii()) {
-      in.store_ascii_as_utf32(utf32_output);
-      utf32_output += 64;
-      pos += 64;
+        const uint8_t *row1 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
+        __m128i shuffle1 = __lsx_vld(row1, 1);
+        __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1);
+
+        __lsx_vst(utf8_0, utf8_output, 0);
+        utf8_output += row0[0];
+        __lsx_vst(utf8_1, utf8_output, 0);
+        utf8_output += row1[0];
+
+        buf += 8;
+      }
+      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
+      // will produce four UTF-8 bytes.
     } else {
-      // -65 is 0b10111111 in two-complement's, so largest possible continuation
-      // byte
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-      size_t max_starting_point = (pos + 64) - 12;
-      while (pos < max_starting_point) {
-        size_t consumed = convert_masked_utf8_to_utf32(
-            input + pos, utf8_end_of_code_point_mask, utf32_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
+      size_t forward = 15;
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) {
+        forward = size_t(end - buf - 1);
       }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFFFF80) == 0) {
+          *utf8_output++ = char(word);
+        } else if ((word & 0xFFFFF800) == 0) {
+          *utf8_output++ = char((word >> 6) | 0b11000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else if ((word & 0xFFFF0000) == 0) {
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k),
+                reinterpret_cast<char *>(utf8_output));
+          }
+          *utf8_output++ = char((word >> 12) | 0b11100000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        } else {
+          if (word > 0x10FFFF) {
+            return std::make_pair(
+                result(error_code::TOO_LARGE, buf - start + k),
+                reinterpret_cast<char *>(utf8_output));
+          }
+          *utf8_output++ = char((word >> 18) | 0b11110000);
+          *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+          *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+          *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        }
+      }
+      buf += k;
     }
-  }
-  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
-                                                       utf32_output);
-  return utf32_output - start;
+  } // while
+
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        reinterpret_cast<char *>(utf8_output));
 }
+/* end file src/lsx/lsx_convert_utf32_to_utf8.cpp */
+/* begin file src/lsx/lsx_convert_utf32_to_utf16.cpp */
+template <endianness big_endian> // UTF-32 -> UTF-16 using LSX vectors
+std::pair<const char32_t *, char16_t *> // {input reached, or nullptr on invalid input; output end}
+lsx_convert_utf32_to_utf16(const char32_t *buf, size_t len,
+                           char16_t *utf16_out) {
+  uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
+  const char32_t *end = buf + len;
 
-} // namespace utf8_to_utf32
-} // unnamed namespace
-} // namespace ppc64
-} // namespace simdutf
-/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
-// other functions
-/* begin file src/generic/utf16.h */
-namespace simdutf {
-namespace ppc64 {
-namespace {
-namespace utf16 {
+  __m128i forbidden_bytemask = __lsx_vrepli_h(0); // sticky surrogate flags, only tested after the loop
+  __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/
+  __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xdfff));
+  while (buf + 8 <= end) { // 8 code points (two 16-byte loads) per pass
+    __m128i in0 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
+    __m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);
 
-template <endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t *in,
-                                               size_t size) {
-  size_t pos = 0;
-  size_t count = 0;
-  for (; pos < size / 32 * 32; pos += 32) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    if (!match_system(big_endian)) {
-      input.swap_bytes();
-    }
-    uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-    count += count_ones(not_pair) / 2;
-  }
-  return count +
-         scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
-}
+    // Check if no bits set above 16th
+    if (__lsx_bz_v(__lsx_vpickod_h(in1, in0))) {
+      __m128i utf16_packed = __lsx_vpickev_h(in1, in0); // keep the even 16-bit lanes of in0 and in1
+      forbidden_bytemask = __lsx_vor_v(
+          __lsx_vand_v( // lanes passing both compares lie in 0xD800..0xDFFF
+              __lsx_vsle_h(utf16_packed, v_dfff),  // utf16_packed <= 0xdfff
+              __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+          forbidden_bytemask);
 
-template <endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
-                                                    size_t size) {
-  size_t pos = 0;
-  size_t count = 0;
-  // This algorithm could no doubt be improved!
-  for (; pos < size / 32 * 32; pos += 32) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    if (!match_system(big_endian)) {
-      input.swap_bytes();
+      if (!match_system(big_endian)) {
+        utf16_packed = lsx_swap_bytes(utf16_packed);
+      }
+      __lsx_vst(utf16_packed, utf16_output, 0); // stored even if a surrogate was flagged above
+      utf16_output += 8;
+      buf += 8;
+    } else { // at least one word has bits above the 16th: scalar path
+      size_t forward = 3; // convert at most 3 words here, then retry the vector path
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) { // cannot trigger here: the loop guard gives end - buf >= 8
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFF0000) == 0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(nullptr, // surrogate code point in UTF-32 input: report failure
+                                  reinterpret_cast<char16_t *>(utf16_output));
+          }
+          *utf16_output++ = !match_system(big_endian)
+                                ? char16_t(word >> 8 | word << 8)
+                                : char16_t(word);
+        } else {
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) {
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char16_t *>(utf16_output));
+          }
+          word -= 0x10000; // remaining 20 bits: top 10 -> 0xD800 + x, low 10 -> 0xDC00 + y
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (!match_system(big_endian)) {
+            high_surrogate =
+                uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
+        }
+      }
+      buf += k;
     }
-    uint64_t ascii_mask = input.lteq(0x7F);
-    uint64_t twobyte_mask = input.lteq(0x7FF);
-    uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+  }
 
-    size_t ascii_count = count_ones(ascii_mask) / 2;
-    size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
-    size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
-    size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-    count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count +
-             ascii_count;
+  // check for invalid input
+  if (__lsx_bnz_v(forbidden_bytemask)) { // any surrogate seen on the vector path
+    return std::make_pair(nullptr, reinterpret_cast<char16_t *>(utf16_output));
   }
-  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
-                                                                   size - pos);
+  return std::make_pair(buf, reinterpret_cast<char16_t *>(utf16_output)); // up to 7 trailing words are left unconverted
 }
 
 template <endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
-                                                     size_t size) {
-  return count_code_points<big_endian>(in, size);
-}
-
-simdutf_really_inline void
-change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
-  size_t pos = 0;
-
-  while (pos < size / 32 * 32) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    input.swap_bytes();
-    input.store(reinterpret_cast<uint16_t *>(output));
-    pos += 32;
-    output += 32;
-  }
-
-  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
-}
+std::pair<result, char16_t *> // {error code + input position, output end}
+lsx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
+                                       char16_t *utf16_out) {
+  uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
+  const char32_t *start = buf; // reported positions are relative to this
+  const char32_t *end = buf + len;
 
-} // namespace utf16
-} // unnamed namespace
-} // namespace ppc64
-} // namespace simdutf
-/* end file src/generic/utf16.h */
-/* begin file src/generic/utf8.h */
+  __m128i forbidden_bytemask = __lsx_vrepli_h(0);
+  __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/
+  __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xdfff));
 
-namespace simdutf {
-namespace ppc64 {
-namespace {
-namespace utf8 {
+  while (buf + 8 <= end) { // 8 code points (two 16-byte loads) per pass
+    __m128i in0 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 0);
+    __m128i in1 = __lsx_vld(reinterpret_cast<const uint32_t *>(buf), 16);
+    // Check if no bits set above 16th
+    if (__lsx_bz_v(__lsx_vpickod_h(in1, in0))) {
+      __m128i utf16_packed = __lsx_vpickev_h(in1, in0); // keep the even 16-bit lanes of in0 and in1
+
+      forbidden_bytemask = __lsx_vor_v(
+          __lsx_vand_v( // lanes passing both compares lie in 0xD800..0xDFFF
+              __lsx_vsle_h(utf16_packed, v_dfff),  // utf16_packed <= 0xdfff
+              __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+          forbidden_bytemask);
+      if (__lsx_bnz_v(forbidden_bytemask)) { // fail before storing; count is the start of this 8-word group
+        return std::make_pair(result(error_code::SURROGATE, buf - start),
+                              reinterpret_cast<char16_t *>(utf16_output));
+      }
 
-using namespace simd;
+      if (!match_system(big_endian)) {
+        utf16_packed = lsx_swap_bytes(utf16_packed);
+      }
 
-simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
-  size_t pos = 0;
-  size_t count = 0;
-  for (; pos + 64 <= size; pos += 64) {
-    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-    uint64_t utf8_continuation_mask = input.gt(-65);
-    count += count_ones(utf8_continuation_mask);
+      __lsx_vst(utf16_packed, utf16_output, 0);
+      utf16_output += 8;
+      buf += 8;
+    } else { // at least one word has bits above the 16th: scalar path
+      size_t forward = 3; // convert at most 3 words here, then retry the vector path
+      size_t k = 0;
+      if (size_t(end - buf) < forward + 1) { // cannot trigger here: the loop guard gives end - buf >= 8
+        forward = size_t(end - buf - 1);
+      }
+      for (; k < forward; k++) {
+        uint32_t word = buf[k];
+        if ((word & 0xFFFF0000) == 0) {
+          // will not generate a surrogate pair
+          if (word >= 0xD800 && word <= 0xDFFF) {
+            return std::make_pair(
+                result(error_code::SURROGATE, buf - start + k), // exact index of the offending word
+                reinterpret_cast<char16_t *>(utf16_output));
+          }
+          *utf16_output++ = !match_system(big_endian)
+                                ? char16_t(word >> 8 | word << 8)
+                                : char16_t(word);
+        } else {
+          // will generate a surrogate pair
+          if (word > 0x10FFFF) {
+            return std::make_pair(
+                result(error_code::TOO_LARGE, buf - start + k),
+                reinterpret_cast<char16_t *>(utf16_output));
+          }
+          word -= 0x10000; // remaining 20 bits: top 10 -> 0xD800 + x, low 10 -> 0xDC00 + y
+          uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+          uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+          if (!match_system(big_endian)) {
+            high_surrogate =
+                uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+          }
+          *utf16_output++ = char16_t(high_surrogate);
+          *utf16_output++ = char16_t(low_surrogate);
+        }
+      }
+      buf += k;
+    }
   }
-  return count + scalar::utf8::count_code_points(in + pos, size - pos);
-}
 
-simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
-                                                    size_t size) {
-  size_t pos = 0;
-  size_t count = 0;
-  // This algorithm could no doubt be improved!
-  for (; pos + 64 <= size; pos += 64) {
-    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-    // We count one word for anything that is not a continuation (so
-    // leading bytes).
-    count += 64 - count_ones(utf8_continuation_mask);
-    int64_t utf8_4byte = input.gteq_unsigned(240);
-    count += count_ones(utf8_4byte);
-  }
-  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
+  return std::make_pair(result(error_code::SUCCESS, buf - start), // count may be < len: up to 7 words stay unconverted
+                        reinterpret_cast<char16_t *>(utf16_output));
 }
-} // namespace utf8
-} // unnamed namespace
-} // namespace ppc64
-} // namespace simdutf
-/* end file src/generic/utf8.h */
+/* end file src/lsx/lsx_convert_utf32_to_utf16.cpp */
+/* begin file src/lsx/lsx_base64.cpp */
+/**
+ * References and further reading:
+ *
+ * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the
+ * speed of a memory copy, Software: Practice and Experience 50 (2), 2020.
+ * https://arxiv.org/abs/1910.05109
+ *
+ * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2
+ * Instructions, ACM Transactions on the Web 12 (3), 2018.
+ * https://arxiv.org/abs/1704.00605
+ *
+ * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings.
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force,
+ * Request for Comments: 4648.
+ *
+ * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization.
+ * http://www.alfredklomp.com/programming/sse-base64/. (2014).
+ *
+ * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD
+ * acceleration. https://github.com/aklomp/base64. (2014).
+ *
+ * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014).
+ * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/
+ *
+ * Nick Kopp. 2013. Base64 Encoding on a GPU.
+ * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
+ */
 
-//
-// Implementation-specific overrides
-//
-namespace simdutf {
-namespace ppc64 {
+template <bool isbase64url> // true selects the alphabet ending in "-_" instead of "+/"
+size_t encode_base64(char *dst, const char *src, size_t srclen,
+                     base64_options options) { // returns vector-path output count + tail encoder result
+  // credit: Wojciech Muła
+  // SSE (lookup: pshufb improved unrolled)
+  const uint8_t *input = (const uint8_t *)src;
+  static const char *lookup_tbl =
+      isbase64url
+          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+  uint8_t *out = (uint8_t *)dst;
 
-simdutf_warn_unused int
-implementation::detect_encodings(const char *input,
-                                 size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if (bom_encoding != encoding_type::unspecified) {
-    return bom_encoding;
-  }
-  // todo: reimplement as a one-pass algorithm.
-  int out = 0;
-  if (validate_utf8(input, length)) {
-    out |= encoding_type::UTF8;
-  }
-  if ((length % 2) == 0) {
-    if (validate_utf16(reinterpret_cast<const char16_t *>(input), length / 2)) {
-      out |= encoding_type::UTF16_LE;
-    }
+  v16u8 shuf;
+  __m128i v_fc0fc00, v_3f03f0, shift_r, shift_l, base64_tbl0, base64_tbl1,
+      base64_tbl2, base64_tbl3;
+  if (srclen >= 16) { // both vector loops need at least 16 readable bytes, so skip setup otherwise
+    shuf = v16u8{1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10}; // bytes 1, 0, 2, 1 of each 3-byte group
+    v_fc0fc00 = __lsx_vreplgr2vr_w(uint32_t(0x0fc0fc00));
+    v_3f03f0 = __lsx_vreplgr2vr_w(uint32_t(0x003f03f0));
+    shift_r = __lsx_vreplgr2vr_w(uint32_t(0x0006000a));
+    shift_l = __lsx_vreplgr2vr_w(uint32_t(0x00080004));
+    base64_tbl0 = __lsx_vld(lookup_tbl, 0); // the 64-char alphabet split into four 16-byte tables
+    base64_tbl1 = __lsx_vld(lookup_tbl, 16);
+    base64_tbl2 = __lsx_vld(lookup_tbl, 32);
+    base64_tbl3 = __lsx_vld(lookup_tbl, 48);
   }
-  if ((length % 4) == 0) {
-    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
-      out |= encoding_type::UTF32_LE;
-    }
+
+  size_t i = 0;
+  for (; i + 52 <= srclen; i += 48) { // 48 input bytes per pass; the last load spans [i + 36, i + 52)
+    __m128i in0 =
+        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 0);
+    __m128i in1 =
+        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 1);
+    __m128i in2 =
+        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 2);
+    __m128i in3 =
+        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 3);
+
+    in0 = __lsx_vshuf_b(in0, in0, (__m128i)shuf);
+    in1 = __lsx_vshuf_b(in1, in1, (__m128i)shuf);
+    in2 = __lsx_vshuf_b(in2, in2, (__m128i)shuf);
+    in3 = __lsx_vshuf_b(in3, in3, (__m128i)shuf);
+
+    __m128i t0_0 = __lsx_vand_v(in0, v_fc0fc00);
+    __m128i t0_1 = __lsx_vand_v(in1, v_fc0fc00);
+    __m128i t0_2 = __lsx_vand_v(in2, v_fc0fc00);
+    __m128i t0_3 = __lsx_vand_v(in3, v_fc0fc00);
+
+    __m128i t1_0 = __lsx_vsrl_h(t0_0, shift_r);
+    __m128i t1_1 = __lsx_vsrl_h(t0_1, shift_r);
+    __m128i t1_2 = __lsx_vsrl_h(t0_2, shift_r);
+    __m128i t1_3 = __lsx_vsrl_h(t0_3, shift_r);
+
+    __m128i t2_0 = __lsx_vand_v(in0, v_3f03f0);
+    __m128i t2_1 = __lsx_vand_v(in1, v_3f03f0);
+    __m128i t2_2 = __lsx_vand_v(in2, v_3f03f0);
+    __m128i t2_3 = __lsx_vand_v(in3, v_3f03f0);
+
+    __m128i t3_0 = __lsx_vsll_h(t2_0, shift_l);
+    __m128i t3_1 = __lsx_vsll_h(t2_1, shift_l);
+    __m128i t3_2 = __lsx_vsll_h(t2_2, shift_l);
+    __m128i t3_3 = __lsx_vsll_h(t2_3, shift_l);
+
+    __m128i input0 = __lsx_vor_v(t1_0, t3_0); // 6-bit alphabet indices, one per byte
+    __m128i input0_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input0);
+    __m128i input0_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
+                                         __lsx_vsub_b(input0, __lsx_vldi(32)));
+    __m128i input0_mask = __lsx_vslei_bu(input0, 31); // indices 0..31 use tbl0/tbl1, the rest tbl2/tbl3
+    __m128i input0_result =
+        __lsx_vbitsel_v(input0_shuf1, input0_shuf0, input0_mask);
+    __lsx_vst(input0_result, reinterpret_cast<__m128i *>(out), 0);
+    out += 16;
+
+    __m128i input1 = __lsx_vor_v(t1_1, t3_1);
+    __m128i input1_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input1);
+    __m128i input1_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
+                                         __lsx_vsub_b(input1, __lsx_vldi(32)));
+    __m128i input1_mask = __lsx_vslei_bu(input1, 31);
+    __m128i input1_result =
+        __lsx_vbitsel_v(input1_shuf1, input1_shuf0, input1_mask);
+    __lsx_vst(input1_result, reinterpret_cast<__m128i *>(out), 0);
+    out += 16;
+
+    __m128i input2 = __lsx_vor_v(t1_2, t3_2);
+    __m128i input2_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input2);
+    __m128i input2_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
+                                         __lsx_vsub_b(input2, __lsx_vldi(32)));
+    __m128i input2_mask = __lsx_vslei_bu(input2, 31);
+    __m128i input2_result =
+        __lsx_vbitsel_v(input2_shuf1, input2_shuf0, input2_mask);
+    __lsx_vst(input2_result, reinterpret_cast<__m128i *>(out), 0);
+    out += 16;
+
+    __m128i input3 = __lsx_vor_v(t1_3, t3_3);
+    __m128i input3_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input3);
+    __m128i input3_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2,
+                                         __lsx_vsub_b(input3, __lsx_vldi(32)));
+    __m128i input3_mask = __lsx_vslei_bu(input3, 31);
+    __m128i input3_result =
+        __lsx_vbitsel_v(input3_shuf1, input3_shuf0, input3_mask);
+    __lsx_vst(input3_result, reinterpret_cast<__m128i *>(out), 0);
+    out += 16;
   }
+  for (; i + 16 <= srclen; i += 12) { // 12 input bytes consumed per pass, 16 loaded
 
-  return out;
-}
+    __m128i in = __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 0);
 
-simdutf_warn_unused bool
-implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  return ppc64::utf8_validation::generic_validate_utf8(buf, len);
-}
+    // bytes from groups A, B and C are needed in separate 32-bit lanes
+    // in = [DDDD|CCCC|BBBB|AAAA]
+    //
+    //      an input triplet has layout
+    //      [????????|ccdddddd|bbbbcccc|aaaaaabb]
+    //        byte 3   byte 2   byte 1   byte 0    -- byte 3 comes from the next
+    //        triplet
+    //
+    //      shuffling changes the order of bytes: 1, 0, 2, 1
+    //      [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
+    //           ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
+    //                  processed bits
+    in = __lsx_vshuf_b(in, in, (__m128i)shuf);
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(
-    const char *buf, size_t len) const noexcept {
-  return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf, len);
-}
+    // unpacking
+    // t0    = [0000cccc|cc000000|aaaaaa00|00000000]
+    __m128i t0 = __lsx_vand_v(in, v_fc0fc00);
+    // t1    = [00000000|00cccccc|00000000|00aaaaaa]
+    //          ((c >> 6),  (a >> 10))
+    __m128i t1 = __lsx_vsrl_h(t0, shift_r);
 
-simdutf_warn_unused bool
-implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return ppc64::utf8_validation::generic_validate_ascii(buf, len);
-}
+    // t2    = [00000000|00dddddd|000000bb|bbbb0000]
+    __m128i t2 = __lsx_vand_v(in, v_3f03f0);
+    // t3    = [00dddddd|00000000|00bbbbbb|00000000]
+    //          ((d << 8), (b << 4))
+    __m128i t3 = __lsx_vsll_h(t2, shift_l);
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(
-    const char *buf, size_t len) const noexcept {
-  return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf, len);
-}
+    // res   = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
+    __m128i indices = __lsx_vor_v(t1, t3);
 
-simdutf_warn_unused bool
-implementation::validate_utf16le(const char16_t *buf,
-                                 size_t len) const noexcept {
-  return scalar::utf16::validate<endianness::LITTLE>(buf, len);
-}
+    __m128i indices_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, indices);
+    __m128i indices_shuf1 = __lsx_vshuf_b(
+        base64_tbl3, base64_tbl2, __lsx_vsub_b(indices, __lsx_vldi(32)));
+    __m128i indices_mask = __lsx_vslei_bu(indices, 31); // indices 0..31 use tbl0/tbl1, the rest tbl2/tbl3
+    __m128i indices_result =
+        __lsx_vbitsel_v(indices_shuf1, indices_shuf0, indices_mask);
 
-simdutf_warn_unused bool
-implementation::validate_utf16be(const char16_t *buf,
-                                 size_t len) const noexcept {
-  return scalar::utf16::validate<endianness::BIG>(buf, len);
-}
+    __lsx_vst(indices_result, reinterpret_cast<__m128i *>(out), 0);
+    out += 16;
+  }
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(
-    const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf16::validate_with_errors<endianness::LITTLE>(buf, len);
+  return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i, // i is a multiple of 12, so i / 3 * 4 is exact
+                                                        srclen - i, options);
 }
 
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(
-    const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf16::validate_with_errors<endianness::BIG>(buf, len);
-}
+static inline void compress(__m128i data, uint16_t mask, char *output) { // both paths store a full 16 bytes at output
+  if (mask == 0) { // fast path: data is stored unshuffled
+    __lsx_vst(data, reinterpret_cast<__m128i *>(output), 0);
+    return;
+  }
+  // this particular implementation was inspired by work done by @animetosho
+  // we do it in two steps, first 8 bytes and then second 8 bytes
+  uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
+  uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits
+  // next line just loads the 64-bit values thintable_epi8[mask1] and
+  // thintable_epi8[mask2] into a 128-bit register, using only
+  // two instructions on most compilers.
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(
-    const char32_t *buf, size_t len) const noexcept {
-  return scalar::utf32::validate_with_errors(buf, len);
-}
+  v2u64 shufmask = {tables::base64::thintable_epi8[mask1],
+                    tables::base64::thintable_epi8[mask2]};
 
-simdutf_warn_unused bool
-implementation::validate_utf32(const char16_t *buf, size_t len) const noexcept {
-  return scalar::utf32::validate(buf, len);
-}
+  // we increment by 0x08 the second half of the mask
+  v4u32 hi = {0, 0, 0x08080808, 0x08080808}; // adds 8 to each byte of the upper 64 bits only
+  __m128i shufmask1 = __lsx_vadd_b((__m128i)shufmask, (__m128i)hi);
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
-    const char * /*buf*/, size_t /*len*/,
-    char16_t * /*utf16_output*/) const noexcept {
-  return 0; // stub
-}
+  // this is the version "nearly pruned"
+  __m128i pruned = __lsx_vshuf_b(data, data, shufmask1);
+  // we still need to put the two halves together.
+  // we compute the popcount of the first half:
+  int pop1 = tables::base64::BitsSetTable256mul2[mask1]; // NOTE(review): table name suggests 2 * popcount(mask1) -- confirm
+  // then load the corresponding mask, what it does is to write
+  // only the first pop1 bytes from the first 8 bytes, and then
+  // it fills in with the bytes from the second 8 bytes + some filling
+  // at the end.
+  __m128i compactmask =
+      __lsx_vld(reinterpret_cast<const __m128i *>(
+                    tables::base64::pshufb_combine_table + pop1 * 8), // 16 bytes are read starting at offset pop1 * 8
+                0);
+  __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask);
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
-    const char * /*buf*/, size_t /*len*/,
-    char16_t * /*utf16_output*/) const noexcept {
-  return 0; // stub
+  __lsx_vst(answer, reinterpret_cast<__m128i *>(output), 0);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
-    const char * /*buf*/, size_t /*len*/,
-    char16_t * /*utf16_output*/) const noexcept {
-  return result(error_code::OTHER, 0); // stub
-}
+struct block64 { // four 128-bit vectors, i.e. 64 bytes handled as one unit
+  __m128i chunks[4];
+};
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
-    const char * /*buf*/, size_t /*len*/,
-    char16_t * /*utf16_output*/) const noexcept {
-  return result(error_code::OTHER, 0); // stub
-}
+template <bool base64_url> // selects which set of lookup constants is used below
+static inline uint16_t to_base64_mask(__m128i *src, bool *error) { // overwrites *src with `out`; returns one flag bit per input byte
+  const v16u8 ascii_space_tbl = {0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                                 0x0,  0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0};
+  // credit: aqrit
+  /*
+  '0'(0x30)-'9'(0x39) => delta_values_index = 4
+  'A'(0x41)-'Z'(0x5a) => delta_values_index = 4/5/12(4+8)
+  'a'(0x61)-'z'(0x7a) => delta_values_index = 6/7/14(6+8)
+  '+'(0x2b)           => delta_values_index = 3
+  '/'(0x2f)           => delta_values_index = 2+8 = 10
+  '-'(0x2d)           => delta_values_index = 2+8 = 10
+  '_'(0x5f)           => delta_values_index = 5+8 = 13
+  */
+  v16u8 delta_asso = {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
+                      0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF};
+  v16i8 delta_values;
+  if (base64_url) {
+    delta_values =
+        v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+              int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+              int8_t(0xB9), int8_t(0x00), int8_t(0x11), int8_t(0xC3),
+              int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0xB9)};
+  } else {
+    delta_values =
+        v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+              int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+              int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
+              int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)};
+  }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
-    const char * /*buf*/, size_t /*len*/,
-    char16_t * /*utf16_output*/) const noexcept {
-  return 0; // stub
-}
+  v16u8 check_asso;
+  if (base64_url) {
+    check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                       0x01, 0x01, 0x03, 0x07, 0x0B, 0x06, 0x0B, 0x12};
+  } else {
+    check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+                       0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F};
+  }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
-    const char * /*buf*/, size_t /*len*/,
-    char16_t * /*utf16_output*/) const noexcept {
-  return 0; // stub
-}
+  v16i8 check_values;
+  if (base64_url) {
+    check_values = v16i8{int8_t(0x0),  int8_t(0x80), int8_t(0x80), int8_t(0x80),
+                         int8_t(0xCF), int8_t(0xBF), int8_t(0xD3), int8_t(0xA6),
+                         int8_t(0xB5), int8_t(0x86), int8_t(0xD0), int8_t(0x80),
+                         int8_t(0xB0), int8_t(0x80), int8_t(0x0),  int8_t(0x0)};
+  } else {
+    check_values =
+        v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+              int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
+              int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
+              int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)};
+  }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
-    const char * /*buf*/, size_t /*len*/,
-    char32_t * /*utf16_output*/) const noexcept {
-  return 0; // stub
-}
+  const __m128i shifted = __lsx_vsrli_b(*src, 3); // each input byte >> 3
+  __m128i asso_index = __lsx_vand_v(*src, __lsx_vldi(0xF)); // low nibble of each input byte
+  const __m128i delta_hash =
+      __lsx_vavgr_bu(__lsx_vshuf_b((__m128i)delta_asso, (__m128i)delta_asso,
+                                   (__m128i)asso_index),
+                     shifted);
+  const __m128i check_hash =
+      __lsx_vavgr_bu(__lsx_vshuf_b((__m128i)check_asso, (__m128i)check_asso,
+                                   (__m128i)asso_index),
+                     shifted);
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
-    const char * /*buf*/, size_t /*len*/,
-    char32_t * /*utf16_output*/) const noexcept {
-  return result(error_code::OTHER, 0); // stub
-}
+  const __m128i out =
+      __lsx_vsadd_b(__lsx_vshuf_b((__m128i)delta_values, (__m128i)delta_values,
+                                  (__m128i)delta_hash),
+                    *src);
+  const __m128i chk =
+      __lsx_vsadd_b(__lsx_vshuf_b((__m128i)check_values, (__m128i)check_values,
+                                  (__m128i)check_hash),
+                    *src);
+  unsigned int mask = __lsx_vpickve2gr_hu(__lsx_vmskltz_b(chk), 0); // bit i set <=> byte i of chk is negative
+  if (mask) { // flagged bytes are acceptable only if every one matches ascii_space_tbl
+    __m128i ascii_space = __lsx_vseq_b(__lsx_vshuf_b((__m128i)ascii_space_tbl,
+                                                     (__m128i)ascii_space_tbl,
+                                                     (__m128i)asso_index),
+                                       *src);
+    *error |= // sticky: this function never clears *error
+        (mask != __lsx_vpickve2gr_hu(__lsx_vmskltz_b((__m128i)ascii_space), 0));
+  }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
-    const char * /*buf*/, size_t /*len*/,
-    char32_t * /*utf16_output*/) const noexcept {
-  return 0; // stub
+  *src = out;
+  return (uint16_t)mask;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert<endianness::LITTLE>(buf, len,
-                                                            utf8_output);
+template <bool base64_url> // forwarded unchanged to the per-chunk overload
+static inline uint64_t to_base64_mask(block64 *b, bool *error) {
+  *error = 0; // reset first: the per-chunk overload only ORs into *error
+  uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], error); // also rewrites the chunk in place
+  uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], error);
+  uint64_t m2 = to_base64_mask<base64_url>(&b->chunks[2], error);
+  uint64_t m3 = to_base64_mask<base64_url>(&b->chunks[3], error);
+  return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48); // chunk i -> bits 16i..16i+15
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert<endianness::BIG>(buf, len, utf8_output);
+static inline void copy_block(block64 *b, char *output) { // stores all four chunks of *b to output
+  __lsx_vst(b->chunks[0], reinterpret_cast<__m128i *>(output), 0); // last argument is the byte offset
+  __lsx_vst(b->chunks[1], reinterpret_cast<__m128i *>(output), 16);
+  __lsx_vst(b->chunks[2], reinterpret_cast<__m128i *>(output), 32);
+  __lsx_vst(b->chunks[3], reinterpret_cast<__m128i *>(output), 48); // output must have room for 64 bytes
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
-      buf, len, utf8_output);
+static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
+  uint64_t nmask = ~mask; // complement of the drop mask; its popcount is the return value
+  uint64_t count = // four packed 16-bit popcounts of nmask, one per chunk
+      __lsx_vpickve2gr_d(__lsx_vpcnt_h(__lsx_vreplgr2vr_d(nmask)), 0);
+  const size_t n0 = uint16_t(count), n1 = uint16_t(count >> 16); // shifts, not a uint16_t* alias of count
+  compress(b->chunks[0], uint16_t(mask), output);
+  compress(b->chunks[1], uint16_t(mask >> 16), output + n0);
+  compress(b->chunks[2], uint16_t(mask >> 32),
+           output + n0 + n1);
+  compress(b->chunks[3], uint16_t(mask >> 48),
+           output + n0 + n1 + uint16_t(count >> 32)); // third per-chunk count
+  return count_ones(nmask); // the caller advances its write pointer by this amount
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
-      buf, len, utf8_output);
+// The caller of this function is responsible for ensuring that 64 bytes are
+// readable at src. The data is read into a block64 structure.
+static inline void load_block(block64 *b, const char *src) {
+  b->chunks[0] = __lsx_vld(reinterpret_cast<const __m128i *>(src), 0); // last argument is the byte offset
+  b->chunks[1] = __lsx_vld(reinterpret_cast<const __m128i *>(src), 16);
+  b->chunks[2] = __lsx_vld(reinterpret_cast<const __m128i *>(src), 32);
+  b->chunks[3] = __lsx_vld(reinterpret_cast<const __m128i *>(src), 48);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_valid<endianness::LITTLE>(buf, len,
-                                                                  utf8_output);
+// The caller of this function is responsible for ensuring that 128 bytes are
+// readable at src. The data is packed into a block64 structure.
+static inline void load_block(block64 *b, const char16_t *src) {
+  __m128i m1 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 0); // offsets are in bytes, not char16_t units
+  __m128i m2 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 16);
+  __m128i m3 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 32);
+  __m128i m4 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 48);
+  __m128i m5 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 64);
+  __m128i m6 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 80);
+  __m128i m7 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 96);
+  __m128i m8 = __lsx_vld(reinterpret_cast<const __m128i *>(src), 112);
+  b->chunks[0] = __lsx_vssrlni_bu_h(m2, m1, 0); // two input vectors -> one chunk; presumably unsigned-saturating 16->8 bit narrowing, TODO confirm
+  b->chunks[1] = __lsx_vssrlni_bu_h(m4, m3, 0);
+  b->chunks[2] = __lsx_vssrlni_bu_h(m6, m5, 0);
+  b->chunks[3] = __lsx_vssrlni_bu_h(m8, m7, 0);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
-    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf16_to_utf8::convert_valid<endianness::BIG>(buf, len,
-                                                               utf8_output);
-}
+static inline void base64_decode(char *out, __m128i str) { // writes 12 bytes at out from the 16 bytes of str
+  __m128i t0 = __lsx_vor_v(
+      __lsx_vslli_w(str, 26),
+      __lsx_vslli_w(__lsx_vand_v(str, __lsx_vldi(-1758 /*0x0000FF00*/)), 12));
+  __m128i t1 =
+      __lsx_vsrli_w(__lsx_vand_v(str, __lsx_vldi(-3521 /*0x003F0000*/)), 2);
+  __m128i t2 = __lsx_vor_v(t0, t1);
+  __m128i t3 = __lsx_vor_v(t2, __lsx_vsrli_w(str, 16));
+  const v16u8 pack_shuffle = {3, 2,  1,  7,  6, 5, 11, 10, // takes bytes 3,2,1 of each 32-bit word
+                              9, 15, 14, 13, 0, 0, 0,  0}; // last four entries are filler, never stored
+  t3 = __lsx_vshuf_b(t3, t3, (__m128i)pack_shuffle);
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert(buf, len, utf8_output);
+  // Store the output:
+  // we only need 12 bytes: one 64-bit element, then one 32-bit element.
+  __lsx_vstelm_d(t3, out, 0, 0);
+  __lsx_vstelm_w(t3, out + 8, 0, 2);
 }
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
+// Decode 64 bytes at src and write 48 bytes at out (four 16 -> 12 byte steps).
+static inline void base64_decode_block(char *out, const char *src) {
+  base64_decode(out, __lsx_vld(reinterpret_cast<const __m128i *>(src), 0));
+  base64_decode(out + 12,
+                __lsx_vld(reinterpret_cast<const __m128i *>(src), 16));
+  base64_decode(out + 24,
+                __lsx_vld(reinterpret_cast<const __m128i *>(src), 32));
+  base64_decode(out + 36,
+                __lsx_vld(reinterpret_cast<const __m128i *>(src), 48));
 }
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
+static inline void base64_decode_block_safe(char *out, const char *src) {
+  base64_decode_block(out, src); // plain forward: identical to the non-"safe" overload here
 }
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len,
-                                                             utf16_output);
+static inline void base64_decode_block(char *out, block64 *b) { // same as the pointer overload, reading from *b
+  base64_decode(out, b->chunks[0]);
+  base64_decode(out + 12, b->chunks[1]); // each chunk contributes 12 output bytes
+  base64_decode(out + 24, b->chunks[2]);
+  base64_decode(out + 36, b->chunks[3]);
 }
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len,
-                                                          utf16_output);
+static inline void base64_decode_block_safe(char *out, block64 *b) {
+  base64_decode_block(out, b); // plain forward: identical to the non-"safe" overload here
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-      buf, len, utf16_output);
-}
+template <bool base64_url, typename char_type> // returns {error, input_count, output_count}
+full_result
+compress_decode_base64(char *dst, const char_type *src, size_t srclen,
+                       base64_options options,
+                       last_chunk_handling_options last_chunk_options) {
+  const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value
+                                        : tables::base64::to_base64_value; // as used below: <= 63 digit, 64 skippable, > 64 invalid
+  size_t equallocation =
+      srclen; // location of the first padding character if any
+  // skip trailing spaces
+  while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+         to_base64[uint8_t(src[srclen - 1])] == 64) {
+    srclen--;
+  }
+  size_t equalsigns = 0; // number of trailing '=' consumed (0, 1 or 2)
+  if (srclen > 0 && src[srclen - 1] == '=') {
+    equallocation = srclen - 1;
+    srclen--;
+    equalsigns = 1;
+    // skip trailing spaces
+    while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) &&
+           to_base64[uint8_t(src[srclen - 1])] == 64) {
+      srclen--;
+    }
+    if (srclen > 0 && src[srclen - 1] == '=') {
+      equallocation = srclen - 1;
+      srclen--;
+      equalsigns = 2;
+    }
+  }
+  if (srclen == 0) { // nothing left once trailing spaces and padding are removed
+    if (equalsigns > 0) {
+      return {INVALID_BASE64_CHARACTER, equallocation, 0};
+    }
+    return {SUCCESS, 0, 0};
+  }
+  const char_type *const srcinit = src;
+  const char *const dstinit = dst;
+  const char_type *const srcend = src + srclen;
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
-      buf, len, utf16_output);
-}
+  constexpr size_t block_size = 10; // staging buffer capacity, in 64-byte blocks
+  char buffer[block_size * 64];
+  char *bufferptr = buffer; // next free byte of the staging buffer
+  if (srclen >= 64) {
+    const char_type *const srcend64 = src + srclen - 64;
+    while (src <= srcend64) {
+      block64 b;
+      load_block(&b, src);
+      src += 64;
+      bool error = false;
+      uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error); // set bits mark bytes that compress_block removes
+      if (badcharmask) {
+        if (error) {
+          src -= 64; // rewind and rescan to report the first offending position
+          while (src < srcend && scalar::base64::is_eight_byte(*src) &&
+                 to_base64[uint8_t(*src)] <= 64) {
+            src++;
+          }
+          if (src < srcend) {
+            // should never happen
+          }
+          return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+                  size_t(dst - dstinit)};
+        }
+      }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_valid<endianness::LITTLE>(
-      buf, len, utf16_output);
-}
+      if (badcharmask != 0) {
+        // optimization opportunity: check for simple masks like those made of
+        // continuous 1s followed by continuous 0s. And masks containing a
+        // single bad character.
+        bufferptr += compress_block(&b, badcharmask, bufferptr);
+      } else {
+        // optimization opportunity: if bufferptr == buffer and mask == 0, we
+        // can avoid the call to compress_block and decode directly.
+        copy_block(&b, bufferptr);
+        bufferptr += 64;
+      }
+      if (bufferptr >= (block_size - 1) * 64 + buffer) { // block_size - 1 full blocks staged: decode them
+        for (size_t i = 0; i < (block_size - 1); i++) {
+          base64_decode_block(dst, buffer + i * 64);
+          dst += 48;
+        }
+        std::memcpy(buffer, buffer + (block_size - 1) * 64,
+                    64); // 64 might be too much
+        bufferptr -= (block_size - 1) * 64; // the partial last block now sits at the front
+      }
+    }
+  }
+  char *buffer_start = buffer;
+  // Optimization note: if this is almost full, then it is worth our
+  // time, otherwise, we should just decode directly.
+  int last_block = (int)((bufferptr - buffer_start) % 64); // bytes staged beyond the last full block
+  if (last_block != 0 && srcend - src + last_block >= 64) { // enough input left to complete that block
+    while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
+      uint8_t val = to_base64[uint8_t(*src)];
+      *bufferptr = char(val);
+      if (!scalar::base64::is_eight_byte(*src) || val > 64) {
+        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+                size_t(dst - dstinit)};
+      }
+      bufferptr += (val <= 63); // val == 64 does not advance, so the next write overwrites it
+      src++;
+    }
+  }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_valid<endianness::BIG>(buf, len,
-                                                                utf16_output);
+  for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { // decode every complete staged block
+    base64_decode_block(dst, buffer_start);
+    dst += 48;
+  }
+  if ((bufferptr - buffer_start) % 64 != 0) {
+    while (buffer_start + 4 < bufferptr) { // more than 4 staged values remain
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 4); // copies 4 bytes although dst only advances by 3
+
+      dst += 3;
+      buffer_start += 4;
+    }
+    if (buffer_start + 4 <= bufferptr) { // exactly 4 staged values remain
+      uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) +
+                         (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) +
+                         (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) +
+                         (uint32_t(uint8_t(buffer_start[3])) << 0 * 6))
+                        << 8;
+      triple = scalar::utf32::swap_bytes(triple);
+      std::memcpy(dst, &triple, 3); // final group: copy exactly 3 bytes
+
+      dst += 3;
+      buffer_start += 4;
+    }
+    // we may have 1, 2 or 3 bytes left and we need to decode them so let us
+    // backtrack
+    int leftover = int(bufferptr - buffer_start);
+    while (leftover > 0) {
+      while (to_base64[uint8_t(*(src - 1))] == 64) { // step back over skippable characters
+        src--;
+      }
+      src--;
+      leftover--;
+    }
+  }
+  if (src < srcend + equalsigns) { // hand the remaining input to the scalar tail decoder
+    full_result r = scalar::base64::base64_tail_decode(
+        dst, src, srcend - src, equalsigns, options, last_chunk_options);
+    r.input_count += size_t(src - srcinit); // offset by the input already consumed above
+    if (r.error == error_code::INVALID_BASE64_CHARACTER ||
+        r.error == error_code::BASE64_EXTRA_BITS) {
+      return r;
+    } else {
+      r.output_count += size_t(dst - dstinit);
+    }
+    if (last_chunk_options != stop_before_partial &&
+        r.error == error_code::SUCCESS && equalsigns > 0) {
+      // additional checks: the padding count must match the output length
+      if ((r.output_count % 3 == 0) ||
+          ((r.output_count % 3) + 1 + equalsigns != 4)) {
+        r.error = error_code::INVALID_BASE64_CHARACTER;
+        r.input_count = equallocation;
+      }
+    }
+    return r;
+  }
+  if (equalsigns > 0) { // same padding consistency check as in the tail branch
+    if ((size_t(dst - dstinit) % 3 == 0) ||
+        ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
+      return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
+    }
+  }
+  return {SUCCESS, srclen, size_t(dst - dstinit)};
 }
+/* end file src/lsx/lsx_base64.cpp */
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert<endianness::LITTLE>(buf, len,
-                                                             utf32_output);
+} // namespace
+} // namespace lsx
+} // namespace simdutf
+
+/* begin file src/generic/buf_block_reader.h */
+namespace simdutf {
+namespace lsx {
+namespace {
+
+// Walks through a buffer in block-sized increments, loading the last part with
+// spaces
+template <size_t STEP_SIZE> struct buf_block_reader {
+public:
+  simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len);
+  simdutf_really_inline size_t block_index(); // current byte offset into buf
+  simdutf_really_inline bool has_full_block() const;
+  simdutf_really_inline const uint8_t *full_block() const; // pointer to the block at the current offset
+  /**
+   * Get the last block, padded with spaces.
+   *
+   * There will always be a last block, with at least 1 byte, unless no bytes
+   * remain (len == idx, e.g. len == 0), in which case this function returns 0
+   * and leaves dst untouched. In particular, if len == STEP_SIZE there will be
+   * 0 full_blocks and 1 remainder block with STEP_SIZE bytes and no padding.
+   *
+   * @return the number of effective characters in the last block.
+   */
+  simdutf_really_inline size_t get_remainder(uint8_t *dst) const;
+  simdutf_really_inline void advance(); // moves the offset forward by STEP_SIZE
+
+private:
+  const uint8_t *buf;
+  const size_t len;
+  const size_t lenminusstep; // len - STEP_SIZE, or 0 when len < STEP_SIZE
+  size_t idx; // current byte offset into buf
+};
+
+// Routines to print masks and text for debugging bitmask operations
+simdutf_unused static char *format_input_text_64(const uint8_t *text) { // reads sizeof(simd8x64<uint8_t>) bytes of text
+  static char *buf = // allocated once, reused by every call, never freed; malloc result is not checked
+      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+    buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); // bytes below ' ' and bytes >= 0x80 (negative as int8_t) become '_'
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert<endianness::BIG>(buf, len,
-                                                          utf32_output);
+// Routines to print masks and text for debugging bitmask operations
+simdutf_unused static char *format_input_text(const simd8x64<uint8_t> &in) {
+  static char *buf = // allocated once, reused by every call, never freed; malloc result is not checked
+      reinterpret_cast<char *>(malloc(sizeof(simd8x64<uint8_t>) + 1));
+  in.store(reinterpret_cast<uint8_t *>(buf));
+  for (size_t i = 0; i < sizeof(simd8x64<uint8_t>); i++) {
+    if (buf[i] < ' ') { // NOTE(review): plain char compare; bytes >= 0x80 are replaced only where char is signed
+      buf[i] = '_';
+    }
+  }
+  buf[sizeof(simd8x64<uint8_t>)] = '\0';
+  return buf;
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
-      buf, len, utf32_output);
+simdutf_unused static char *format_mask(uint64_t mask) { // debugging aid: renders mask as 64 characters, bit 0 first
+  static char *buf = reinterpret_cast<char *>(malloc(64 + 1)); // allocated once, reused by every call, never freed
+  for (size_t i = 0; i < 64; i++) {
+    buf[i] = (mask & (uint64_t(1) << i)) ? 'X' : ' '; // 64-bit shift operand: size_t(1) << i is undefined for i >= 32 on 32-bit size_t
+  }
+  buf[64] = '\0';
+  return buf;
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
-      buf, len, utf32_output);
-}
+template <size_t STEP_SIZE>
+simdutf_really_inline
+buf_block_reader<STEP_SIZE>::buf_block_reader(const uint8_t *_buf, size_t _len)
+    : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, // clamped at 0 so the unsigned subtraction cannot wrap
+      idx{0} {}
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_valid<endianness::LITTLE>(
-      buf, len, utf32_output);
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t buf_block_reader<STEP_SIZE>::block_index() {
+  return idx; // byte offset of the current block, not a block count
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
-    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
-  return scalar::utf16_to_utf32::convert_valid<endianness::BIG>(buf, len,
-                                                                utf32_output);
+template <size_t STEP_SIZE>
+simdutf_really_inline bool buf_block_reader<STEP_SIZE>::has_full_block() const {
+  return idx < lenminusstep; // strict: a final block of exactly STEP_SIZE bytes is left to get_remainder()
 }
 
-void implementation::change_endianness_utf16(const char16_t *input,
-                                             size_t length,
-                                             char16_t *output) const noexcept {
-  scalar::utf16::change_endianness_utf16(input, length, output);
+template <size_t STEP_SIZE>
+simdutf_really_inline const uint8_t *
+buf_block_reader<STEP_SIZE>::full_block() const {
+  return &buf[idx]; // no bounds check here; callers are expected to test has_full_block() first
 }
 
-simdutf_warn_unused size_t implementation::count_utf16le(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::utf16::count_code_points<endianness::LITTLE>(input, length);
+template <size_t STEP_SIZE>
+simdutf_really_inline size_t
+buf_block_reader<STEP_SIZE>::get_remainder(uint8_t *dst) const { // dst must have room for STEP_SIZE bytes
+  if (len == idx) {
+    return 0; // nothing left: dst is not written at all
+  } // memcpy(dst, null, 0) will trigger an error with some sanitizers
+  std::memset(dst, 0x20,
+              STEP_SIZE); // std::memset STEP_SIZE because it is more efficient
+                          // to write out 8 or 16 bytes at once.
+  std::memcpy(dst, buf + idx, len - idx); // the remaining bytes overwrite the start of the space padding
+  return len - idx;
 }
 
-simdutf_warn_unused size_t implementation::count_utf16be(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::utf16::count_code_points<endianness::BIG>(input, length);
+template <size_t STEP_SIZE>
+simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
+  idx += STEP_SIZE; // unconditional: no check against len
 }
 
-simdutf_warn_unused size_t
-implementation::count_utf8(const char *input, size_t length) const noexcept {
-  return utf8::count_code_points(input, length);
-}
+} // unnamed namespace
+} // namespace lsx
+} // namespace simdutf
+/* end file src/generic/buf_block_reader.h */
+/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
+namespace simdutf {
+namespace lsx {
+namespace {
+namespace utf8_validation {
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(input,
-                                                                   length);
-}
+using namespace simd;
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
-}
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation)
+  // Bit 2 = Overlong 3-byte
+  // Bit 4 = Surrogate
+  // Bit 5 = Overlong 2-byte
+  // Bit 7 = Two Continuations (bit 3 = Too Large; bit 6 = Too Large 1000 / Overlong 4-byte)
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ (shares bit 6 with TOO_LARGE_1000)
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::utf16::utf32_length_from_utf16<endianness::LITTLE>(input,
-                                                                    length);
-}
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>( // indexed by the high nibble of prev1
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low = // indexed by the low nibble of prev1
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
-}
+              // ____0100 ________
+              CARRY | TOO_LARGE,
+              // ____0101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____011_ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
-    const char *input, size_t length) const noexcept {
-  return scalar::utf8::utf16_length_from_utf8(input, length);
-}
+              // ____1___ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____1101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>( // indexed by the high nibble of input
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
-    const char32_t *input, size_t length) const noexcept {
-  return scalar::utf32::utf8_length_from_utf32(input, length);
-}
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
-    const char32_t *input, size_t length) const noexcept {
-  return scalar::utf32::utf16_length_from_utf32(input, length);
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high); // an error bit survives only if all three lookups flag it
 }
-
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
-    const char *input, size_t length) const noexcept {
-  return scalar::utf8::count_code_points(input, length);
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) { // sc: result of check_special_cases for the same input
+  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
+  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
+  simd8<uint8_t> must23 =
+      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
+  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80); // keep only bit 7, the TWO_CONTS bit of sc
+  return must23_80 ^ sc; // XOR flips bit 7 of sc wherever must23 has it set; other bits of sc pass through
 }
 
-simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
-    const char *input, size_t length) const noexcept {
-  return scalar::base64::maximal_binary_length_from_base64(input, length);
+//
+// Return nonzero if there are incomplete multibyte characters at the end of the
+// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end.
+//
+simdutf_really_inline simd8<uint8_t> is_incomplete(const simd8<uint8_t> input) {
+  // If the previous input's last 3 bytes match this, they're too short (they
+  // ended at EOF):
+  // ... 1111____ 111_____ 11______
+  static const uint8_t max_array[32] = {255, // 255 entries can never be exceeded by a byte
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        255,
+                                        0b11110000u - 1, // third-to-last byte: limit just below 1111____
+                                        0b11100000u - 1, // second-to-last byte: limit just below 111_____
+                                        0b11000000u - 1}; // last byte: limit just below 11______
+  const simd8<uint8_t> max_value(
+      &max_array[sizeof(max_array) - sizeof(simd8<uint8_t>)]); // load the final sizeof(simd8<uint8_t>) entries
+  return input.gt_bits(max_value);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(
-    const char *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_options) const noexcept {
-  // skip trailing spaces
-  while (length > 0 &&
-         scalar::base64::is_ascii_white_space(input[length - 1])) {
-    length--;
+struct utf8_checker {
+  // If this is nonzero, there has been a UTF-8 error.
+  simd8<uint8_t> error;
+  // The last input we received
+  simd8<uint8_t> prev_input_block;
+  // Whether the last input we received was incomplete (used for ASCII fast
+  // path)
+  simd8<uint8_t> prev_incomplete;
+
+  //
+  // Check whether the current bytes are valid UTF-8.
+  //
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // prev1 comes from input.prev<1>(prev_input); presumably each lane holds
+    // the byte that precedes the matching lane of input. The special-case
+    // result sc is then fed into the multibyte length check below.
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    simd8<uint8_t> sc = check_special_cases(input, prev1);
+    this->error |= check_multibyte_lengths(input, prev_input, sc);
   }
-  size_t equallocation =
-      length; // location of the first padding character if any
-  size_t equalsigns = 0;
-  if (length > 0 && input[length - 1] == '=') {
-    equallocation = length - 1;
-    length -= 1;
-    equalsigns++;
-    while (length > 0 &&
-           scalar::base64::is_ascii_white_space(input[length - 1])) {
-      length--;
-    }
-    if (length > 0 && input[length - 1] == '=') {
-      equallocation = length - 1;
-      equalsigns++;
-      length -= 1;
-    }
+
+  // The only problem that can happen at EOF is that a multibyte character is
+  // too short or a byte value too large in the last bytes: check_special_cases
+  // only checks for bytes too large in the first of two bytes.
+  simdutf_really_inline void check_eof() {
+    // If the previous block had incomplete UTF-8 characters at the end, an
+    // ASCII block can't possibly finish them.
+    this->error |= this->prev_incomplete;
   }
-  if (length == 0) {
-    if (equalsigns > 0) {
-      return {INVALID_BASE64_CHARACTER, equallocation};
+
+  simdutf_really_inline void check_next_input(const simd8x64<uint8_t> &input) {
+    if (simdutf_likely(is_ascii(input))) {
+      this->error |= this->prev_incomplete;
+    } else {
+      // you might think that a for-loop would work, but under Visual Studio, it
+      // is not good enough.
+      static_assert((simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                        (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+                    "We support either two or four chunks per 64-byte block.");
+      if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+      } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+        this->check_utf8_bytes(input.chunks[0], this->prev_input_block);
+        this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+        this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+      }
+      this->prev_incomplete =
+          is_incomplete(input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1]);
+      this->prev_input_block = input.chunks[simd8x64<uint8_t>::NUM_CHUNKS - 1];
     }
-    return {SUCCESS, 0};
   }
-  result r = scalar::base64::base64_tail_decode(
-      output, input, length, equalsigns, options, last_chunk_options);
-  if (last_chunk_options != stop_before_partial &&
-      r.error == error_code::SUCCESS && equalsigns > 0) {
-    // additional checks
-    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, equallocation};
-    }
+
+  // do not forget to call check_eof!
+  simdutf_really_inline bool errors() const {
+    return this->error.any_bits_set_anywhere();
   }
-  return r;
+
+}; // struct utf8_checker
+} // namespace utf8_validation
+
+using utf8_validation::utf8_checker;
+
+} // unnamed namespace
+} // namespace lsx
+} // namespace simdutf
+/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
+/* begin file src/generic/utf8_validation/utf8_validator.h */
+namespace simdutf {
+namespace lsx {
+namespace {
+namespace utf8_validation {
+
+/**
+ * Validates that the string is actual UTF-8.
+ */
+template <class checker>
+bool generic_validate_utf8(const uint8_t *input, size_t length) {
+  checker c{};
+  buf_block_reader<64> reader(input, length);
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
+    c.check_next_input(in);
+    reader.advance();
+  }
+  uint8_t block[64]{};
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  c.check_next_input(in);
+  reader.advance();
+  c.check_eof();
+  return !c.errors();
 }
 
-simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
-    const char16_t *input, size_t length) const noexcept {
-  return scalar::base64::maximal_binary_length_from_base64(input, length);
+bool generic_validate_utf8(const char *input, size_t length) {
+  return generic_validate_utf8<utf8_checker>(
+      reinterpret_cast<const uint8_t *>(input), length);
 }
 
-simdutf_warn_unused result implementation::base64_to_binary(
-    const char16_t *input, size_t length, char *output, base64_options options,
-    last_chunk_handling_options last_chunk_options) const noexcept {
-  // skip trailing spaces
-  while (length > 0 &&
-         scalar::base64::is_ascii_white_space(input[length - 1])) {
-    length--;
-  }
-  size_t equallocation =
-      length; // location of the first padding character if any
-  size_t equalsigns = 0;
-  if (length > 0 && input[length - 1] == '=') {
-    equallocation = length - 1;
-    length -= 1;
-    equalsigns++;
-    while (length > 0 &&
-           scalar::base64::is_ascii_white_space(input[length - 1])) {
-      length--;
-    }
-    if (length > 0 && input[length - 1] == '=') {
-      equallocation = length - 1;
-      equalsigns++;
-      length -= 1;
-    }
-  }
-  if (length == 0) {
-    if (equalsigns > 0) {
-      return {INVALID_BASE64_CHARACTER, equallocation};
+/**
+ * Validates that the string is actual UTF-8 and stops on errors.
+ */
+template <class checker>
+result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) {
+  checker c{};
+  buf_block_reader<64> reader(input, length);
+  size_t count{0};
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
+    c.check_next_input(in);
+    if (c.errors()) {
+      if (count != 0) {
+        count--;
+      } // Sometimes the error is only detected in the next chunk
+      result res = scalar::utf8::rewind_and_validate_with_errors(
+          reinterpret_cast<const char *>(input),
+          reinterpret_cast<const char *>(input + count), length - count);
+      res.count += count;
+      return res;
     }
-    return {SUCCESS, 0};
+    reader.advance();
+    count += 64;
   }
-  result r = scalar::base64::base64_tail_decode(
-      output, input, length, equalsigns, options, last_chunk_options);
-  if (last_chunk_options != stop_before_partial &&
-      r.error == error_code::SUCCESS && equalsigns > 0) {
-    // additional checks
-    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, equallocation};
-    }
+  uint8_t block[64]{};
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  c.check_next_input(in);
+  reader.advance();
+  c.check_eof();
+  if (c.errors()) {
+    if (count != 0) {
+      count--;
+    } // Sometimes the error is only detected in the next chunk
+    result res = scalar::utf8::rewind_and_validate_with_errors(
+        reinterpret_cast<const char *>(input),
+        reinterpret_cast<const char *>(input) + count, length - count);
+    res.count += count;
+    return res;
+  } else {
+    return result(error_code::SUCCESS, length);
   }
-  return r;
 }
 
-simdutf_warn_unused size_t implementation::base64_length_from_binary(
-    size_t length, base64_options options) const noexcept {
-  return scalar::base64::base64_length_from_binary(length, options);
+result generic_validate_utf8_with_errors(const char *input, size_t length) {
+  return generic_validate_utf8_with_errors<utf8_checker>(
+      reinterpret_cast<const uint8_t *>(input), length);
 }
 
-size_t implementation::binary_to_base64(const char *input, size_t length,
-                                        char *output,
-                                        base64_options options) const noexcept {
-  return scalar::base64::binary_to_base64(input, length, output, options);
+template <class checker>
+bool generic_validate_ascii(const uint8_t *input, size_t length) {
+  buf_block_reader<64> reader(input, length);
+  uint8_t blocks[64]{};
+  simd::simd8x64<uint8_t> running_or(blocks);
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
+    running_or |= in;
+    reader.advance();
+  }
+  uint8_t block[64]{};
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  running_or |= in;
+  return running_or.is_ascii();
 }
-} // namespace ppc64
-} // namespace simdutf
-
-/* begin file src/simdutf/ppc64/end.h */
-/* end file src/simdutf/ppc64/end.h */
-/* end file src/ppc64/implementation.cpp */
-#endif
-#if SIMDUTF_IMPLEMENTATION_RVV
-/* begin file src/rvv/implementation.cpp */
-
-
-
 
+bool generic_validate_ascii(const char *input, size_t length) {
+  return generic_validate_ascii<utf8_checker>(
+      reinterpret_cast<const uint8_t *>(input), length);
+}
 
-/* begin file src/simdutf/rvv/begin.h */
-// redefining SIMDUTF_IMPLEMENTATION to "rvv"
-// #define SIMDUTF_IMPLEMENTATION rvv
+template <class checker>
+result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) {
+  buf_block_reader<64> reader(input, length);
+  size_t count{0};
+  while (reader.has_full_block()) {
+    simd::simd8x64<uint8_t> in(reader.full_block());
+    if (!in.is_ascii()) {
+      result res = scalar::ascii::validate_with_errors(
+          reinterpret_cast<const char *>(input + count), length - count);
+      return result(res.error, count + res.count);
+    }
+    reader.advance();
 
-#if SIMDUTF_CAN_ALWAYS_RUN_RVV
-// nothing needed.
-#else
-SIMDUTF_TARGET_RVV
-#endif
-/* end file src/simdutf/rvv/begin.h */
-namespace simdutf {
-namespace rvv {
-namespace {
-#ifndef SIMDUTF_RVV_H
-  #error "rvv.h must be included"
-#endif
+    count += 64;
+  }
+  uint8_t block[64]{};
+  reader.get_remainder(block);
+  simd::simd8x64<uint8_t> in(block);
+  if (!in.is_ascii()) {
+    result res = scalar::ascii::validate_with_errors(
+        reinterpret_cast<const char *>(input + count), length - count);
+    return result(res.error, count + res.count);
+  } else {
+    return result(error_code::SUCCESS, length);
+  }
+}
+
+result generic_validate_ascii_with_errors(const char *input, size_t length) {
+  return generic_validate_ascii_with_errors<utf8_checker>(
+      reinterpret_cast<const uint8_t *>(input), length);
+}
 
+} // namespace utf8_validation
 } // unnamed namespace
-} // namespace rvv
+} // namespace lsx
 } // namespace simdutf
+/* end file src/generic/utf8_validation/utf8_validator.h */
+
+// transcoding from UTF-8 to Latin 1
+/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
 
-//
-// Implementation-specific overrides
-//
 namespace simdutf {
-namespace rvv {
-/* begin file src/rvv/rvv_helpers.inl.cpp */
-template <simdutf_ByteFlip bflip>
-simdutf_really_inline static size_t
-rvv_utf32_store_utf16_m4(uint16_t *dst, vuint32m4_t utf32, size_t vl,
-                         vbool4_t m4even) {
-  /* convert [000000000000aaaa|aaaaaabbbbbbbbbb]
-   * to      [110111bbbbbbbbbb|110110aaaaaaaaaa] */
-  vuint32m4_t sur = __riscv_vsub_vx_u32m4(utf32, 0x10000, vl);
-  sur = __riscv_vor_vv_u32m4(__riscv_vsll_vx_u32m4(sur, 16, vl),
-                             __riscv_vsrl_vx_u32m4(sur, 10, vl), vl);
-  sur = __riscv_vand_vx_u32m4(sur, 0x3FF03FF, vl);
-  sur = __riscv_vor_vx_u32m4(sur, 0xDC00D800, vl);
-  /* merge 1 byte utf32 and 2 byte sur */
-  vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(utf32, 0xFFFF, vl);
-  vuint16m4_t utf32_16 = __riscv_vreinterpret_v_u32m4_u16m4(
-      __riscv_vmerge_vvm_u32m4(utf32, sur, m4, vl));
-  /* compress and store */
-  vbool4_t mOut = __riscv_vmor_mm_b4(
-      __riscv_vmsne_vx_u16m4_b4(utf32_16, 0, vl * 2), m4even, vl * 2);
-  vuint16m4_t vout = __riscv_vcompress_vm_u16m4(utf32_16, mOut, vl * 2);
-  vl = __riscv_vcpop_m_b4(mOut, vl * 2);
-  __riscv_vse16_v_u16m4(dst, simdutf_byteflip<bflip>(vout, vl), vl);
-  return vl;
-};
-/* end file src/rvv/rvv_helpers.inl.cpp */
+namespace lsx {
+namespace {
+namespace utf8_to_latin1 {
+using namespace simd;
 
-/* begin file src/rvv/rvv_length_from.inl.cpp */
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
+  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
+  // 0b11000010 and nothing else.
+  //
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation)
+  // Bit 2 = Overlong 3-byte
+  // Bit 4 = Surrogate
+  // Bit 5 = Overlong 2-byte
+  // Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+  constexpr const uint8_t FORBIDDEN = 0xff;
 
-simdutf_warn_unused size_t
-implementation::count_utf16le(const char16_t *src, size_t len) const noexcept {
-  return utf32_length_from_utf16le(src, len);
-}
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      FORBIDDEN,
+      // 1110____ ________ <three byte lead in byte 1>
+      FORBIDDEN,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      FORBIDDEN);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
 
-simdutf_warn_unused size_t
-implementation::count_utf16be(const char16_t *src, size_t len) const noexcept {
-  return utf32_length_from_utf16be(src, len);
-}
+              // ____0100 ________
+              FORBIDDEN,
+              // ____0101 ________
+              FORBIDDEN,
+              // ____011_ ________
+              FORBIDDEN, FORBIDDEN,
 
-simdutf_warn_unused size_t
-implementation::count_utf8(const char *src, size_t len) const noexcept {
-  return utf32_length_from_utf8(src, len);
-}
+              // ____1___ ________
+              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
+              // ____1101 ________
+              FORBIDDEN, FORBIDDEN, FORBIDDEN);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
 
-simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
-    const char *src, size_t len) const noexcept {
-  return utf32_length_from_utf8(src, len);
-}
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
 
-simdutf_warn_unused size_t
-implementation::latin1_length_from_utf16(size_t len) const noexcept {
-  return len;
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high);
 }
 
-simdutf_warn_unused size_t
-implementation::latin1_length_from_utf32(size_t len) const noexcept {
-  return len;
-}
+struct validating_transcoder {
+  // If this is nonzero, there has been a UTF-8 error.
+  simd8<uint8_t> error;
 
-simdutf_warn_unused size_t
-implementation::utf16_length_from_latin1(size_t len) const noexcept {
-  return len;
-}
+  validating_transcoder() : error(uint8_t(0)) {}
+  //
+  // Check whether the current bytes are valid UTF-8.
+  //
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // prev1 is the "byte 1" operand of check_special_cases: for each byte of
+    // input ("byte 2") it is expected to hold the byte just before it, with
+    // prev_input presumably supplying the predecessor of the first byte.
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    this->error |= check_special_cases(input, prev1);
+  }
 
-simdutf_warn_unused size_t
-implementation::utf32_length_from_latin1(size_t len) const noexcept {
-  return len;
-}
+  simdutf_really_inline size_t convert(const char *in, size_t size,
+                                       char *latin1_output) {
+    size_t pos = 0;
+    char *start{latin1_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 16 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 16; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) >
+                       -65); // twos complement of -65 is 1011 1111 ...
+    }
+    // If the input is long enough, then in[margin] is the sixteenth-to-last
+    // leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store((int8_t *)latin1_output);
+        latin1_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask =
+            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
+                               // this case, we also have ASCII to account for.
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 inputs bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_latin1(
+              in + pos, utf8_end_of_code_point_mask, latin1_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      return 0;
+    }
+    if (pos < size) {
+      size_t howmany =
+          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
+      if (howmany == 0) {
+        return 0;
+      }
+      latin1_output += howmany;
+    }
+    return latin1_output - start;
+  }
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
-    const char *src, size_t len) const noexcept {
-  size_t count = 0;
-  for (size_t vl; len > 0; len -= vl, src += vl) {
-    vl = __riscv_vsetvl_e8m8(len);
-    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
-    vbool1_t mask = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl);
-    count += __riscv_vcpop_m_b1(mask, vl);
+  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+                                                   char *latin1_output) {
+    size_t pos = 0;
+    char *start{latin1_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, then in[margin] is the eighth-to-last
+    // leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store((int8_t *)latin1_output);
+        latin1_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        if (errors()) {
+          // rewind_and_convert_with_errors will seek a potential error from
+          // in+pos onward, with the ability to go back up to pos bytes, and
+          // read size-pos bytes forward.
+          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+              pos, in + pos, size - pos, latin1_output);
+          res.count += pos;
+          return res;
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 inputs bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_latin1(
+              in + pos, utf8_end_of_code_point_mask, latin1_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, latin1_output);
+      res.count += pos;
+      return res;
+    }
+    if (pos < size) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, latin1_output);
+      if (res.error) { // In case of error, we want the error position
+        res.count += pos;
+        return res;
+      } else { // In case of success, we want the number of bytes written
+        latin1_output += res.count;
+      }
+    }
+    return result(error_code::SUCCESS, latin1_output - start);
   }
-  return count;
-}
 
-template <simdutf_ByteFlip bflip>
-simdutf_really_inline static size_t
-rvv_utf32_length_from_utf16(const char16_t *src, size_t len) {
-  size_t count = 0;
-  for (size_t vl; len > 0; len -= vl, src += vl) {
-    vl = __riscv_vsetvl_e16m8(len);
-    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
-    v = simdutf_byteflip<bflip>(v, vl);
-    vbool2_t notHigh =
-        __riscv_vmor_mm_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 0xDFFF, vl),
-                           __riscv_vmsltu_vx_u16m8_b2(v, 0xDC00, vl), vl);
-    count += __riscv_vcpop_m_b2(notHigh, vl);
+  simdutf_really_inline bool errors() const {
+    return this->error.any_bits_set_anywhere();
   }
-  return count;
-}
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
-    const char16_t *src, size_t len) const noexcept {
-  return rvv_utf32_length_from_utf16<simdutf_ByteFlip::NONE>(src, len);
-}
+}; // struct validating_transcoder
+} // namespace utf8_to_latin1
+} // unnamed namespace
+} // namespace lsx
+} // namespace simdutf
+/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
+/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
 
-simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
-    const char16_t *src, size_t len) const noexcept {
-  if (supports_zvbb())
-    return rvv_utf32_length_from_utf16<simdutf_ByteFlip::ZVBB>(src, len);
-  else
-    return rvv_utf32_length_from_utf16<simdutf_ByteFlip::V>(src, len);
-}
+namespace simdutf {
+namespace lsx {
+namespace {
+namespace utf8_to_latin1 {
+using namespace simd;
 
-simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
-    const char *src, size_t len) const noexcept {
-  size_t count = len;
-  for (size_t vl; len > 0; len -= vl, src += vl) {
-    vl = __riscv_vsetvl_e8m8(len);
-    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
-    count += __riscv_vcpop_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl);
+simdutf_really_inline size_t convert_valid(const char *in, size_t size,
+                                           char *latin1_output) {
+  size_t pos = 0;
+  char *start{latin1_output};
+  // In the worst case, we have the haswell kernel which can cause an overflow
+  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
+  // 16 bytes, and if the data is valid, then it is entirely safe because 16
+  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
+  // assume that you have valid UTF-8 input, so we are going to go back from the
+  // end counting 8 leading bytes, to give us a good margin.
+  size_t leading_byte = 0;
+  size_t margin = size;
+  for (; margin > 0 && leading_byte < 8; margin--) {
+    leading_byte += (int8_t(in[margin - 1]) >
+                     -65); // twos complement of -65 is 1011 1111 ...
   }
-  return count;
-}
-
-template <simdutf_ByteFlip bflip>
-simdutf_really_inline static size_t
-rvv_utf8_length_from_utf16(const char16_t *src, size_t len) {
-  size_t count = 0;
-  for (size_t vl; len > 0; len -= vl, src += vl) {
-    vl = __riscv_vsetvl_e16m8(len);
-    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
-    v = simdutf_byteflip<bflip>(v, vl);
-    vbool2_t m234 = __riscv_vmsgtu_vx_u16m8_b2(v, 0x7F, vl);
-    vbool2_t m34 = __riscv_vmsgtu_vx_u16m8_b2(v, 0x7FF, vl);
-    vbool2_t notSur =
-        __riscv_vmor_mm_b2(__riscv_vmsltu_vx_u16m8_b2(v, 0xD800, vl),
-                           __riscv_vmsgtu_vx_u16m8_b2(v, 0xDFFF, vl), vl);
-    vbool2_t m3 = __riscv_vmand_mm_b2(m34, notSur, vl);
-    count += vl + __riscv_vcpop_m_b2(m234, vl) + __riscv_vcpop_m_b2(m3, vl);
+  // If the input is long enough, then we have that margin-1 is the eight last
+  // leading byte.
+  const size_t safety_margin = size - margin + 1; // to avoid overruns!
+  while (pos + 64 + safety_margin <= size) {
+    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+    if (input.is_ascii()) {
+      input.store((int8_t *)latin1_output);
+      latin1_output += 64;
+      pos += 64;
+    } else {
+      // you might think that a for-loop would work, but under Visual Studio, it
+      // is not good enough.
+      uint64_t utf8_continuation_mask =
+          input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
+                             // this case, we also have ASCII to account for.
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+      // We process in blocks of up to 12 bytes except possibly
+      // for fast paths which may process up to 16 bytes. For the
+      // slow path to work, we should have at least 12 input bytes left.
+      size_t max_starting_point = (pos + 64) - 12;
+      // Next loop is going to run at least five times.
+      while (pos < max_starting_point) {
+        // Performance note: our ability to compute 'consumed' and
+        // then shift and recompute is critical. If there is a
+        // latency of, say, 4 cycles on getting 'consumed', then
+        // the inner loop might have a total latency of about 6 cycles.
+        // Yet we process between 6 and 12 input bytes, thus we get
+        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+        // for this section of the code. Hence, there is a limit
+        // to how much we can further increase this latency before
+        // it seriously harms performance.
+        size_t consumed = convert_masked_utf8_to_latin1(
+            in + pos, utf8_end_of_code_point_mask, latin1_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
+      }
+      // At this point there may remain between 0 and 12 bytes in the
+      // 64-byte block. These bytes will be processed again. So we have an
+      // 80% efficiency (in the worst case). In practice we expect an
+      // 85% to 90% efficiency.
+    }
   }
-  return count;
-}
-
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
-    const char16_t *src, size_t len) const noexcept {
-  return rvv_utf8_length_from_utf16<simdutf_ByteFlip::NONE>(src, len);
+  if (pos < size) {
+    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
+                                                           latin1_output);
+    latin1_output += howmany;
+  }
+  return latin1_output - start;
 }
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
-    const char16_t *src, size_t len) const noexcept {
-  if (supports_zvbb())
-    return rvv_utf8_length_from_utf16<simdutf_ByteFlip::ZVBB>(src, len);
-  else
-    return rvv_utf8_length_from_utf16<simdutf_ByteFlip::V>(src, len);
-}
+} // namespace utf8_to_latin1
+} // namespace
+} // namespace lsx
+} // namespace simdutf
+  // namespace simdutf
+/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
+// transcoding from UTF-8 to UTF-16
+/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
 
-simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
-    const char32_t *src, size_t len) const noexcept {
-  size_t count = 0;
-  for (size_t vl; len > 0; len -= vl, src += vl) {
-    vl = __riscv_vsetvl_e32m8(len);
-    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
-    vbool4_t m234 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7F, vl);
-    vbool4_t m34 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7FF, vl);
-    vbool4_t m4 = __riscv_vmsgtu_vx_u32m8_b4(v, 0xFFFF, vl);
-    count += vl + __riscv_vcpop_m_b4(m234, vl) + __riscv_vcpop_m_b4(m34, vl) +
-             __riscv_vcpop_m_b4(m4, vl);
-  }
-  return count;
-}
+namespace simdutf {
+namespace lsx {
+namespace {
+namespace utf8_to_utf16 {
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
-    const char *src, size_t len) const noexcept {
-  size_t count = 0;
-  for (size_t vl; len > 0; len -= vl, src += vl) {
-    vl = __riscv_vsetvl_e8m8(len);
-    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
-    vbool1_t m1234 = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl);
-    vbool1_t m4 = __riscv_vmsgtu_vx_u8m8_b1(__riscv_vreinterpret_u8m8(v),
-                                            (uint8_t)0b11101111, vl);
-    count += __riscv_vcpop_m_b1(m1234, vl) + __riscv_vcpop_m_b1(m4, vl);
-  }
-  return count;
-}
+using namespace simd;
 
-simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
-    const char32_t *src, size_t len) const noexcept {
-  size_t count = 0;
-  for (size_t vl; len > 0; len -= vl, src += vl) {
-    vl = __riscv_vsetvl_e32m8(len);
-    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
-    vbool4_t m4 = __riscv_vmsgtu_vx_u32m8_b4(v, 0xFFFF, vl);
-    count += vl + __riscv_vcpop_m_b4(m4, vl);
+template <endianness endian>
+simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
+                                         char16_t *utf16_output) noexcept {
+  // This implementation is not specific to any one kernel: it lives in the
+  // generic directory and is included here inside the lsx namespace.
+  size_t pos = 0;
+  char16_t *start{utf16_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while (pos + 64 + safety_margin <= size) {
+    // this loop could be unrolled further. For example, we could process the
+    // mask far more than 64 bytes.
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if (in.is_ascii()) {
+      in.store_ascii_as_utf16<endian>(utf16_output);
+      utf16_output += 64;
+      pos += 64;
+    } else {
+      // Slow path. We hope that the compiler will recognize that this is a slow
+      // path. Anything that is not a continuation mask is a 'leading byte',
+      // that is, the start of a new code point.
+      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+      // -65 is 0b10111111 in two's complement, so it is the largest possible
+      // continuation byte
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      // The *start* of code points is not so useful, rather, we want the *end*
+      // of code points.
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+      // We process in blocks of up to 12 bytes except possibly
+      // for fast paths which may process up to 16 bytes. For the
+      // slow path to work, we should have at least 12 input bytes left.
+      size_t max_starting_point = (pos + 64) - 12;
+      // Next loop is going to run at least five times when using solely
+      // the slow/regular path, and at least four times if there are fast paths.
+      while (pos < max_starting_point) {
+        // Performance note: our ability to compute 'consumed' and
+        // then shift and recompute is critical. If there is a
+        // latency of, say, 4 cycles on getting 'consumed', then
+        // the inner loop might have a total latency of about 6 cycles.
+        // Yet we process between 6 and 12 input bytes, thus we get
+        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+        // for this section of the code. Hence, there is a limit
+        // to how much we can further increase this latency before
+        // it seriously harms performance.
+        //
+        // Thus we may allow convert_masked_utf8_to_utf16 to process
+        // more bytes at a time under a fast-path mode where 16 bytes
+        // are consumed at once (e.g., when encountering ASCII).
+        size_t consumed = convert_masked_utf8_to_utf16<endian>(
+            input + pos, utf8_end_of_code_point_mask, utf16_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
+      }
+      // At this point there may remain between 0 and 12 bytes in the
+      // 64-byte block. These bytes will be processed again. So we have an
+      // 80% efficiency (in the worst case). In practice we expect an
+      // 85% to 90% efficiency.
+    }
   }
-  return count;
+  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
+      input + pos, size - pos, utf16_output);
+  return utf16_output - start;
 }
-/* end file src/rvv/rvv_length_from.inl.cpp */
-/* begin file src/rvv/rvv_validate.inl.cpp */
 
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace lsx
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
+/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
-simdutf_warn_unused bool
-implementation::validate_ascii(const char *src, size_t len) const noexcept {
-  size_t vlmax = __riscv_vsetvlmax_e8m8();
-  vint8m8_t mask = __riscv_vmv_v_x_i8m8(0, vlmax);
-  for (size_t vl; len > 0; len -= vl, src += vl) {
-    vl = __riscv_vsetvl_e8m8(len);
-    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
-    mask = __riscv_vor_vv_i8m8_tu(mask, mask, v, vl);
-  }
-  return __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(mask, 0, vlmax), vlmax) <
-         0;
-}
+namespace simdutf {
+namespace lsx {
+namespace {
+namespace utf8_to_utf16 {
+using namespace simd;
 
-simdutf_warn_unused result implementation::validate_ascii_with_errors(
-    const char *src, size_t len) const noexcept {
-  const char *beg = src;
-  for (size_t vl; len > 0; len -= vl, src += vl) {
-    vl = __riscv_vsetvl_e8m8(len);
-    vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl);
-    long idx = __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl);
-    if (idx >= 0)
-      return result(error_code::TOO_LARGE, src - beg + idx);
-  }
-  return result(error_code::SUCCESS, src - beg);
-}
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation)
+  // Bit 2 = Overlong 3-byte
+  // Bit 4 = Surrogate
+  // Bit 5 = Overlong 2-byte
+  // Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
-/* Returns a close estimation of the number of valid UTF-8 bytes up to the
- * first invalid one, but never overestimating. */
-simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src,
-                                                         size_t len) {
-  const char *beg = src;
-  if (len < 32)
-    return 0;
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
 
-  /* validate first three bytes */
-  {
-    size_t idx = 3;
-    while (idx < len && (src[idx] >> 6) == 0b10)
-      ++idx;
-    if (idx > 3 + 3 || !scalar::utf8::validate(src, idx))
-      return 0;
-  }
+              // ____0100 ________
+              CARRY | TOO_LARGE,
+              // ____0101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____011_ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
 
-  static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080};
-  static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB};
-  static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6};
+              // ____1___ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____1101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
 
-  const vuint8m1_t err1tbl =
-      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2));
-  const vuint8m1_t err2tbl =
-      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2));
-  const vuint8m1_t err3tbl =
-      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2));
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
 
-  size_t tail = 3;
-  size_t n = len - tail;
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high);
+}
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) {
+  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
+  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
+  simd8<uint8_t> must23 =
+      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
+  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
+  return must23_80 ^ sc;
+}
 
-  for (size_t vl; n > 0; n -= vl, src += vl) {
-    vl = __riscv_vsetvl_e8m4(n);
-    vuint8m4_t v0 = __riscv_vle8_v_u8m4((uint8_t const *)src, vl);
+struct validating_transcoder {
+  // If this is nonzero, there has been a UTF-8 error.
+  simd8<uint8_t> error;
 
-    uint8_t next0 = src[vl + 0];
-    uint8_t next1 = src[vl + 1];
-    uint8_t next2 = src[vl + 2];
+  validating_transcoder() : error(uint8_t(0)) {}
+  //
+  // Check whether the current bytes are valid UTF-8.
+  //
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+
+    // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
+    // small negative numbers)
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    simd8<uint8_t> sc = check_special_cases(input, prev1);
+    this->error |= check_multibyte_lengths(input, prev_input, sc);
+  }
 
-    /* fast path: ASCII */
-    if (__riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u8m4_b2(v0, 0b01111111, vl), vl) <
-            0 &&
-        (next0 | next1 | next2) < 0b10000000)
-      continue;
+  template <endianness endian>
+  simdutf_really_inline size_t convert(const char *in, size_t size,
+                                       char16_t *utf16_output) {
+    size_t pos = 0;
+    char16_t *start{utf16_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, then in[margin] is the eighth-to-last
+    // leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf16<endian>(utf16_output);
+        utf16_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (utf8_continuation_mask & 1) {
+          return 0; // error
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 and 12 input bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf16<endian>(
+              in + pos, utf8_end_of_code_point_mask, utf16_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      return 0;
+    }
+    if (pos < size) {
+      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
+          in + pos, size - pos, utf16_output);
+      if (howmany == 0) {
+        return 0;
+      }
+      utf16_output += howmany;
+    }
+    return utf16_output - start;
+  }
 
-    /* see "Validating UTF-8 In Less Than One Instruction Per Byte"
-     * https://arxiv.org/abs/2010.03090 */
-    vuint8m4_t v1 = __riscv_vslide1down_vx_u8m4(v0, next0, vl);
-    vuint8m4_t v2 = __riscv_vslide1down_vx_u8m4(v1, next1, vl);
-    vuint8m4_t v3 = __riscv_vslide1down_vx_u8m4(v2, next2, vl);
+  template <endianness endian>
+  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+                                                   char16_t *utf16_output) {
+    size_t pos = 0;
+    char16_t *start{utf16_output};
+    // In the worst case, we have the haswell kernel which can cause an overflow
+    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
+    // last 16 bytes, and if the data is valid, then it is entirely safe because
+    // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
+    // generally assume that you have valid UTF-8 input, so we are going to go
+    // back from the end counting 8 leading bytes, to give us a good margin.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, then in[margin] is the eighth-to-last
+    // leading byte.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf16<endian>(utf16_output);
+        utf16_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (errors() || (utf8_continuation_mask & 1)) {
+          // rewind_and_convert_with_errors will seek a potential error from
+          // in+pos onward, with the ability to go back up to pos bytes, and
+          // read size-pos bytes forward.
+          result res =
+              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+                  pos, in + pos, size - pos, utf16_output);
+          res.count += pos;
+          return res;
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Next loop is going to run at least five times.
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 and 12 input bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf16<endian>(
+              in + pos, utf8_end_of_code_point_mask, utf16_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res =
+          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+              pos, in + pos, size - pos, utf16_output);
+      res.count += pos;
+      return res;
+    }
+    if (pos < size) {
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res =
+          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+              pos, in + pos, size - pos, utf16_output);
+      if (res.error) { // In case of error, we want the error position
+        res.count += pos;
+        return res;
+      } else { // In case of success, we want the number of words written
+        utf16_output += res.count;
+      }
+    }
+    return result(error_code::SUCCESS, utf16_output - start);
+  }
 
-    vuint8m4_t s1 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4(
-        __riscv_vreinterpret_v_u8m4_u16m4(v2), 4, __riscv_vsetvlmax_e16m4()));
-    vuint8m4_t s3 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4(
-        __riscv_vreinterpret_v_u8m4_u16m4(v3), 4, __riscv_vsetvlmax_e16m4()));
+  simdutf_really_inline bool errors() const {
+    return this->error.any_bits_set_anywhere();
+  }
 
-    vuint8m4_t idx2 = __riscv_vand_vx_u8m4(v2, 0xF, vl);
-    vuint8m4_t idx1 = __riscv_vand_vx_u8m4(s1, 0xF, vl);
-    vuint8m4_t idx3 = __riscv_vand_vx_u8m4(s3, 0xF, vl);
+}; // struct validating_transcoder
+} // namespace utf8_to_utf16
+} // unnamed namespace
+} // namespace lsx
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
+// transcoding from UTF-8 to UTF-32
+/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 
-    vuint8m4_t err1 = simdutf_vrgather_u8m1x4(err1tbl, idx1);
-    vuint8m4_t err2 = simdutf_vrgather_u8m1x4(err2tbl, idx2);
-    vuint8m4_t err3 = simdutf_vrgather_u8m1x4(err3tbl, idx3);
-    vint8m4_t errs = __riscv_vreinterpret_v_u8m4_i8m4(
-        __riscv_vand_vv_u8m4(__riscv_vand_vv_u8m4(err1, err2, vl), err3, vl));
+namespace simdutf {
+namespace lsx {
+namespace {
+namespace utf8_to_utf32 {
 
-    vbool2_t is_3 = __riscv_vmsgtu_vx_u8m4_b2(v1, 0b11100000 - 1, vl);
-    vbool2_t is_4 = __riscv_vmsgtu_vx_u8m4_b2(v0, 0b11110000 - 1, vl);
-    vbool2_t is_34 = __riscv_vmor_mm_b2(is_3, is_4, vl);
-    vbool2_t err34 =
-        __riscv_vmxor_mm_b2(is_34, __riscv_vmslt_vx_i8m4_b2(errs, 0, vl), vl);
-    vbool2_t errm =
-        __riscv_vmor_mm_b2(__riscv_vmsgt_vx_i8m4_b2(errs, 0, vl), err34, vl);
-    if (__riscv_vfirst_m_b2(errm, vl) >= 0)
-      break;
-  }
+using namespace simd;
 
-  /* we need to validate the last character */
-  while (tail < len && (src[0] >> 6) == 0b10)
-    --src, ++tail;
-  return src - beg;
+simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
+                                         char32_t *utf32_output) noexcept {
+  size_t pos = 0;
+  char32_t *start{utf32_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while (pos + 64 + safety_margin <= size) {
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if (in.is_ascii()) {
+      in.store_ascii_as_utf32(utf32_output);
+      utf32_output += 64;
+      pos += 64;
+    } else {
+      // -65 is 0b10111111 in two's complement, so it is the largest possible
+      // continuation byte
+      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+      size_t max_starting_point = (pos + 64) - 12;
+      while (pos < max_starting_point) {
+        size_t consumed = convert_masked_utf8_to_utf32(
+            input + pos, utf8_end_of_code_point_mask, utf32_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
+      }
+    }
+  }
+  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
+                                                       utf32_output);
+  return utf32_output - start;
 }
 
-simdutf_warn_unused bool
-implementation::validate_utf8(const char *src, size_t len) const noexcept {
-  size_t count = rvv_count_valid_utf8(src, len);
-  return scalar::utf8::validate(src + count, len - count);
-}
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace lsx
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
+/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
-simdutf_warn_unused result implementation::validate_utf8_with_errors(
-    const char *src, size_t len) const noexcept {
-  size_t count = rvv_count_valid_utf8(src, len);
-  result res = scalar::utf8::validate_with_errors(src + count, len - count);
-  return result(res.error, count + res.count);
-}
+namespace simdutf {
+namespace lsx {
+namespace {
+namespace utf8_to_utf32 {
+using namespace simd;
 
-simdutf_warn_unused bool
-implementation::validate_utf16le(const char16_t *src,
-                                 size_t len) const noexcept {
-  return validate_utf16le_with_errors(src, len).error == error_code::SUCCESS;
-}
+simdutf_really_inline simd8<uint8_t>
+check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
+  // Bit 1 = Too Long (ASCII followed by continuation)
+  // Bit 2 = Overlong 3-byte
+  // Bit 4 = Surrogate
+  // Bit 5 = Overlong 2-byte
+  // Bit 7 = Two Continuations
+  constexpr const uint8_t TOO_SHORT = 1 << 0;  // 11______ 0_______
+                                               // 11______ 11______
+  constexpr const uint8_t TOO_LONG = 1 << 1;   // 0_______ 10______
+  constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____
+  constexpr const uint8_t SURROGATE = 1 << 4;  // 11101101 101_____
+  constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______
+  constexpr const uint8_t TWO_CONTS = 1 << 7;  // 10______ 10______
+  constexpr const uint8_t TOO_LARGE = 1 << 3;  // 11110100 1001____
+                                               // 11110100 101_____
+                                               // 11110101 1001____
+                                               // 11110101 101_____
+                                               // 1111011_ 1001____
+                                               // 1111011_ 101_____
+                                               // 11111___ 1001____
+                                               // 11111___ 101_____
+  constexpr const uint8_t TOO_LARGE_1000 = 1 << 6;
+  // 11110101 1000____
+  // 1111011_ 1000____
+  // 11111___ 1000____
+  constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
 
-simdutf_warn_unused bool
-implementation::validate_utf16be(const char16_t *src,
-                                 size_t len) const noexcept {
-  return validate_utf16be_with_errors(src, len).error == error_code::SUCCESS;
-}
+  const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
+      // 0_______ ________ <ASCII in byte 1>
+      TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
+      TOO_LONG,
+      // 10______ ________ <continuation in byte 1>
+      TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
+      // 1100____ ________ <two byte lead in byte 1>
+      TOO_SHORT | OVERLONG_2,
+      // 1101____ ________ <two byte lead in byte 1>
+      TOO_SHORT,
+      // 1110____ ________ <three byte lead in byte 1>
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      // 1111____ ________ <four+ byte lead in byte 1>
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+  constexpr const uint8_t CARRY =
+      TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
+  const simd8<uint8_t> byte_1_low =
+      (prev1 & 0x0F)
+          .lookup_16<uint8_t>(
+              // ____0000 ________
+              CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
+              // ____0001 ________
+              CARRY | OVERLONG_2,
+              // ____001_ ________
+              CARRY, CARRY,
 
-template <simdutf_ByteFlip bflip>
-simdutf_really_inline static result
-rvv_validate_utf16_with_errors(const char16_t *src, size_t len) {
-  const char16_t *beg = src;
-  uint16_t last = 0;
-  for (size_t vl; len > 0;
-       len -= vl, src += vl, last = simdutf_byteflip<bflip>(src[-1])) {
-    vl = __riscv_vsetvl_e16m8(len);
-    vuint16m8_t v1 = __riscv_vle16_v_u16m8((const uint16_t *)src, vl);
-    v1 = simdutf_byteflip<bflip>(v1, vl);
-    vuint16m8_t v0 = __riscv_vslide1up_vx_u16m8(v1, last, vl);
+              // ____0100 ________
+              CARRY | TOO_LARGE,
+              // ____0101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____011_ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
 
-    vbool2_t surhi = __riscv_vmseq_vx_u16m8_b2(
-        __riscv_vand_vx_u16m8(v0, 0xFC00, vl), 0xD800, vl);
-    vbool2_t surlo = __riscv_vmseq_vx_u16m8_b2(
-        __riscv_vand_vx_u16m8(v1, 0xFC00, vl), 0xDC00, vl);
+              // ____1___ ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              // ____1101 ________
+              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000);
+  const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
+      // ________ 0_______ <ASCII in byte 2>
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
+      TOO_SHORT, TOO_SHORT,
 
-    long idx = __riscv_vfirst_m_b2(__riscv_vmxor_mm_b2(surhi, surlo, vl), vl);
-    if (idx >= 0) {
-      last = idx > 0 ? simdutf_byteflip<bflip>(src[idx - 1]) : last;
-      return result(error_code::SURROGATE,
-                    src - beg + idx - (last - 0xD800u < 0x400u));
-      break;
-    }
-  }
-  if (last - 0xD800u < 0x400u) {
-    return result(error_code::SURROGATE,
-                  src - beg - 1); /* end on high surrogate */
-  } else {
-    return result(error_code::SUCCESS, src - beg);
-  }
-}
+      // ________ 1000____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 |
+          OVERLONG_4,
+      // ________ 1001____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
+      // ________ 101_____
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
+      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
 
-simdutf_warn_unused result implementation::validate_utf16le_with_errors(
-    const char16_t *src, size_t len) const noexcept {
-  return rvv_validate_utf16_with_errors<simdutf_ByteFlip::NONE>(src, len);
+      // ________ 11______
+      TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
+  return (byte_1_high & byte_1_low & byte_2_high);
 }
-
-simdutf_warn_unused result implementation::validate_utf16be_with_errors(
-    const char16_t *src, size_t len) const noexcept {
-  if (supports_zvbb())
-    return rvv_validate_utf16_with_errors<simdutf_ByteFlip::ZVBB>(src, len);
-  else
-    return rvv_validate_utf16_with_errors<simdutf_ByteFlip::V>(src, len);
+// Masks must_be_2_3_continuation(prev2, prev3) down to bit 7 (0x80) and XORs
+// it into `sc`, so that bit survives only where the two disagree.
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) {
+  const simd8<uint8_t> expects_cont = simd8<uint8_t>(must_be_2_3_continuation(
+      input.prev<2>(prev_input), input.prev<3>(prev_input)));
+  // NOTE(review): bit 7 is presumably the TWO_CONTS flag carried in `sc`.
+  return (expects_cont & uint8_t(0x80)) ^ sc;
 }
 
-simdutf_warn_unused bool
-implementation::validate_utf32(const char32_t *src, size_t len) const noexcept {
-  size_t vlmax = __riscv_vsetvlmax_e32m8();
-  vuint32m8_t max = __riscv_vmv_v_x_u32m8(0x10FFFF, vlmax);
-  vuint32m8_t maxOff = __riscv_vmv_v_x_u32m8(0xFFFFF7FF, vlmax);
-  for (size_t vl; len > 0; len -= vl, src += vl) {
-    vl = __riscv_vsetvl_e32m8(len);
-    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
-    vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl);
-    max = __riscv_vmaxu_vv_u32m8_tu(max, max, v, vl);
-    maxOff = __riscv_vmaxu_vv_u32m8_tu(maxOff, maxOff, off, vl);
+struct validating_transcoder {
+  // Error accumulator: nonzero once any checked block had a UTF-8 error.
+  simd8<uint8_t> error;
+
+  validating_transcoder() : error(uint8_t(0)) {}
+  // Validates one SIMD register of input (`input`) given the register that
+  // preceded it (`prev_input`) and ORs any resulting flags into this->error.
+  // Nothing is reported here; callers query errors() afterwards.
+  simdutf_really_inline void check_utf8_bytes(const simd8<uint8_t> input,
+                                              const simd8<uint8_t> prev_input) {
+    // prev1 = input.prev<1>(prev_input): presumably each byte's predecessor,
+    // with the first lane taken from prev_input (confirm in simd8::prev). The
+    // special-case flags are then combined with the multibyte length check.
+    simd8<uint8_t> prev1 = input.prev<1>(prev_input);
+    simd8<uint8_t> sc = check_special_cases(input, prev1);
+    this->error |= check_multibyte_lengths(input, prev_input, sc);
   }
-  return __riscv_vfirst_m_b4(
-             __riscv_vmor_mm_b4(
-                 __riscv_vmsne_vx_u32m8_b4(max, 0x10FFFF, vlmax),
-                 __riscv_vmsne_vx_u32m8_b4(maxOff, 0xFFFFF7FF, vlmax), vlmax),
-             vlmax) < 0;
-}
 
-simdutf_warn_unused result implementation::validate_utf32_with_errors(
-    const char32_t *src, size_t len) const noexcept {
-  const char32_t *beg = src;
-  for (size_t vl; len > 0; len -= vl, src += vl) {
-    vl = __riscv_vsetvl_e32m8(len);
-    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
-    vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl);
-    long idx1 =
-        __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 0x10FFFF, vl), vl);
-    long idx2 = __riscv_vfirst_m_b4(
-        __riscv_vmsgtu_vx_u32m8_b4(off, 0xFFFFF7FF, vl), vl);
-    if (idx1 >= 0 && idx2 >= 0) {
-      if (idx1 <= idx2) {
-        return result(error_code::TOO_LARGE, src - beg + idx1);
+  simdutf_really_inline size_t convert(const char *in, size_t size,
+                                       char32_t *utf32_output) {
+    size_t pos = 0;
+    char32_t *start{utf32_output};
+    // Purpose: validate and transcode UTF-8 to UTF-32; returns the number of
+    // char32_t written, or 0 on invalid input. convert_masked_utf8_to_utf32
+    // may write past the code points it consumed (the original note cites 8
+    // extra words for the haswell kernel), so the SIMD loop must stop early.
+    // Input may be invalid, so rather than skipping a fixed byte count we walk
+    // back from the end until 8 non-continuation bytes (> -65) have been seen.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
+    }
+    // If the input is long enough, in[margin] is now the eighth-last
+    // non-continuation byte; safety_margin spans from there to the end, plus 1.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf32(utf32_output);
+        utf32_output += 64;
+        pos += 64;
       } else {
-        return result(error_code::SURROGATE, src - beg + idx2);
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1); // 10xxxxxx bytes
+        if (utf8_continuation_mask & 1) {
+          return 0; // error: mask bit 0 (presumably the block's first byte)
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Runs at least four times: 52 bytes, at most 16 per call (see above).
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 inputs bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf32(
+              in + pos, utf8_end_of_code_point_mask, utf32_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
       }
     }
-    if (idx1 >= 0) {
-      return result(error_code::TOO_LARGE, src - beg + idx1);
+    if (errors()) {
+      return 0;
     }
-    if (idx2 >= 0) {
-      return result(error_code::SURROGATE, src - beg + idx2);
+    if (pos < size) {
+      size_t howmany =
+          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+      if (howmany == 0) {
+        return 0;
+      }
+      utf32_output += howmany;
     }
+    return utf32_output - start;
   }
-  return result(error_code::SUCCESS, src - beg);
-}
-/* end file src/rvv/rvv_validate.inl.cpp */
-
-/* begin file src/rvv/rvv_latin1_to.inl.cpp */
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
-    const char *src, size_t len, char *dst) const noexcept {
-  char *beg = dst;
-  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) {
-    vl = __riscv_vsetvl_e8m2(len);
-    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
-    vbool4_t nascii =
-        __riscv_vmslt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v1), 0, vl);
-    size_t cnt = __riscv_vcpop_m_b4(nascii, vl);
-    vlOut = vl + cnt;
-    if (cnt == 0) {
-      __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut);
-      continue;
+  simdutf_really_inline result convert_with_errors(const char *in, size_t size,
+                                                   char32_t *utf32_output) {
+    size_t pos = 0;
+    char32_t *start{utf32_output};
+    // Purpose: validate and transcode UTF-8 to UTF-32. On success the result
+    // holds the number of char32_t written; on error, the error code and the
+    // input position. convert_masked_utf8_to_utf32 may write beyond what it
+    // consumed (the original note cites the haswell kernel), and the input
+    // may be invalid, so we walk back from the end until 8 non-continuation
+    // bytes (> -65) are seen and keep the SIMD loop away from that tail.
+    size_t leading_byte = 0;
+    size_t margin = size;
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
     }
-
-    vuint8m2_t v0 =
-        __riscv_vor_vx_u8m2(__riscv_vsrl_vx_u8m2(v1, 6, vl), 0b11000000, vl);
-    v1 = __riscv_vand_vx_u8m2_mu(nascii, v1, v1, 0b10111111, vl);
-
-    vuint8m4_t wide =
-        __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vwmaccu_vx_u16m4(
-            __riscv_vwaddu_vv_u16m4(v0, v1, vl), 0xFF, v1, vl));
-    vbool2_t mask = __riscv_vmsgtu_vx_u8m4_b2(
-        __riscv_vsub_vx_u8m4(wide, 0b11000000, vl * 2), 1, vl * 2);
-    vuint8m4_t comp = __riscv_vcompress_vm_u8m4(wide, mask, vl * 2);
-
-    __riscv_vse8_v_u8m4((uint8_t *)dst, comp, vlOut);
-  }
-  return dst - beg;
-}
-
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
-    const char *src, size_t len, char16_t *dst) const noexcept {
-  char16_t *beg = dst;
-  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
-    vl = __riscv_vsetvl_e8m4(len);
-    vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl);
-    __riscv_vse16_v_u16m8((uint16_t *)dst, __riscv_vzext_vf2_u16m8(v, vl), vl);
-  }
-  return dst - beg;
-}
-
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
-    const char *src, size_t len, char16_t *dst) const noexcept {
-  char16_t *beg = dst;
-  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
-    vl = __riscv_vsetvl_e8m4(len);
-    vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl);
-    __riscv_vse16_v_u16m8(
-        (uint16_t *)dst,
-        __riscv_vsll_vx_u16m8(__riscv_vzext_vf2_u16m8(v, vl), 8, vl), vl);
+    // If the input is long enough, in[margin] is now the eighth-last
+    // non-continuation byte; safety_margin spans from there to the end, plus 1.
+    const size_t safety_margin = size - margin + 1; // to avoid overruns!
+    while (pos + 64 + safety_margin <= size) {
+      simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+      if (input.is_ascii()) {
+        input.store_ascii_as_utf32(utf32_output);
+        utf32_output += 64;
+        pos += 64;
+      } else {
+        // you might think that a for-loop would work, but under Visual Studio,
+        // it is not good enough.
+        static_assert(
+            (simd8x64<uint8_t>::NUM_CHUNKS == 2) ||
+                (simd8x64<uint8_t>::NUM_CHUNKS == 4),
+            "We support either two or four chunks per 64-byte block.");
+        auto zero = simd8<uint8_t>{uint8_t(0)};
+        if (simd8x64<uint8_t>::NUM_CHUNKS == 2) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+        } else if (simd8x64<uint8_t>::NUM_CHUNKS == 4) {
+          this->check_utf8_bytes(input.chunks[0], zero);
+          this->check_utf8_bytes(input.chunks[1], input.chunks[0]);
+          this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
+          this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
+        }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1); // 10xxxxxx bytes
+        if (errors() || (utf8_continuation_mask & 1)) {
+          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+              pos, in + pos, size - pos, utf32_output);
+          res.count += pos; // presumably rebases the position onto `in`
+          return res;
+        }
+        uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+        uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+        // We process in blocks of up to 12 bytes except possibly
+        // for fast paths which may process up to 16 bytes. For the
+        // slow path to work, we should have at least 12 input bytes left.
+        size_t max_starting_point = (pos + 64) - 12;
+        // Runs at least four times: 52 bytes, at most 16 per call (see above).
+        while (pos < max_starting_point) {
+          // Performance note: our ability to compute 'consumed' and
+          // then shift and recompute is critical. If there is a
+          // latency of, say, 4 cycles on getting 'consumed', then
+          // the inner loop might have a total latency of about 6 cycles.
+          // Yet we process between 6 to 12 inputs bytes, thus we get
+          // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+          // for this section of the code. Hence, there is a limit
+          // to how much we can further increase this latency before
+          // it seriously harms performance.
+          size_t consumed = convert_masked_utf8_to_utf32(
+              in + pos, utf8_end_of_code_point_mask, utf32_output);
+          pos += consumed;
+          utf8_end_of_code_point_mask >>= consumed;
+        }
+        // At this point there may remain between 0 and 12 bytes in the
+        // 64-byte block. These bytes will be processed again. So we have an
+        // 80% efficiency (in the worst case). In practice we expect an
+        // 85% to 90% efficiency.
+      }
+    }
+    if (errors()) { // note: the loop above already returns when errors() is set
+      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, utf32_output);
+      res.count += pos;
+      return res;
+    }
+    if (pos < size) {
+      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, utf32_output);
+      if (res.error) { // In case of error, we want the error position
+        res.count += pos;
+        return res;
+      } else { // In case of success, we want the number of words written
+        utf32_output += res.count;
+      }
+    }
+    return result(error_code::SUCCESS, utf32_output - start);
   }
-  return dst - beg;
-}
 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
-    const char *src, size_t len, char32_t *dst) const noexcept {
-  char32_t *beg = dst;
-  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
-    vl = __riscv_vsetvl_e8m2(len);
-    vuint8m2_t v = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
-    __riscv_vse32_v_u32m8((uint32_t *)dst, __riscv_vzext_vf4_u32m8(v, vl), vl);
+  simdutf_really_inline bool errors() const {
+    return this->error.any_bits_set_anywhere(); // any flag accumulated so far
   }
-  return dst - beg;
-}
-/* end file src/rvv/rvv_latin1_to.inl.cpp */
-/* begin file src/rvv/rvv_utf16_to.inl.cpp */
-#include <cstdio>
 
-template <simdutf_ByteFlip bflip>
-simdutf_really_inline static result
-rvv_utf16_to_latin1_with_errors(const char16_t *src, size_t len, char *dst) {
-  const char16_t *const beg = src;
-  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
-    vl = __riscv_vsetvl_e16m8(len);
-    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
-    v = simdutf_byteflip<bflip>(v, vl);
-    long idx = __riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 255, vl), vl);
-    if (idx >= 0)
-      return result(error_code::TOO_LARGE, src - beg + idx);
-    __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl);
-  }
-  return result(error_code::SUCCESS, src - beg);
-}
+}; // struct validating_transcoder
+} // namespace utf8_to_utf32
+} // unnamed namespace
+} // namespace lsx
+} // namespace simdutf
+/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
-    const char16_t *src, size_t len, char *dst) const noexcept {
-  result res = convert_utf16le_to_latin1_with_errors(src, len, dst);
-  return res.error == error_code::SUCCESS ? res.count : 0;
-}
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
-    const char16_t *src, size_t len, char *dst) const noexcept {
-  result res = convert_utf16be_to_latin1_with_errors(src, len, dst);
-  return res.error == error_code::SUCCESS ? res.count : 0;
-}
+// other functions
+/* begin file src/generic/utf8.h */
 
-simdutf_warn_unused result
-implementation::convert_utf16le_to_latin1_with_errors(
-    const char16_t *src, size_t len, char *dst) const noexcept {
-  return rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
-}
+namespace simdutf {
+namespace lsx {
+namespace {
+namespace utf8 {
 
-simdutf_warn_unused result
-implementation::convert_utf16be_to_latin1_with_errors(
-    const char16_t *src, size_t len, char *dst) const noexcept {
-  if (supports_zvbb())
-    return rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::ZVBB>(src, len,
-                                                                   dst);
-  else
-    return rvv_utf16_to_latin1_with_errors<simdutf_ByteFlip::V>(src, len, dst);
-}
+using namespace simd;
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
-    const char16_t *src, size_t len, char *dst) const noexcept {
-  const char16_t *const beg = src;
-  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
-    vl = __riscv_vsetvl_e16m8(len);
-    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
-    __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl);
+// Counts UTF-8 code points: bytes greater than -65 as int8 (not 10xxxxxx).
+simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
+  size_t offset = 0, total = 0;
+  for (; offset + 64 <= size; offset += 64) { // whole 64-byte blocks only
+    simd8x64<int8_t> block(reinterpret_cast<const int8_t *>(in + offset));
+    const uint64_t lead_mask = block.gt(-65); // presumably one bit per byte
+    total += count_ones(lead_mask);
   }
-  return src - beg;
+  return total + scalar::utf8::count_code_points(in + offset, size - offset);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
-    const char16_t *src, size_t len, char *dst) const noexcept {
-  const char16_t *const beg = src;
-  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
-    vl = __riscv_vsetvl_e16m8(len);
-    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
-    __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vnsrl_wx_u8m4(v, 8, vl), vl);
+// UTF-16 code units needed for UTF-8 input: one per non-continuation byte plus
+// one extra per byte >= 240 (4-byte lead range); the tail is done in scalar.
+simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
+                                                    size_t size) {
+  size_t pos = 0;
+  size_t count = 0;
+  for (; pos + 64 <= size; pos += 64) { // whole 64-byte blocks only
+    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+    const uint64_t utf8_continuation_mask = input.lt(-65 + 1); // 10xxxxxx
+    count += 64 - count_ones(utf8_continuation_mask); // non-continuations
+    // Unsigned like every other mask here: bit 63 must not be a sign bit.
+    const uint64_t utf8_4byte = input.gteq_unsigned(240);
+    count += count_ones(utf8_4byte);
   }
-  return src - beg;
+  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
 }
+} // namespace utf8
+} // unnamed namespace
+} // namespace lsx
+} // namespace simdutf
+/* end file src/generic/utf8.h */
+/* begin file src/generic/utf16.h */
+namespace simdutf {
+namespace lsx {
+namespace {
+namespace utf16 {
 
-template <simdutf_ByteFlip bflip>
-simdutf_really_inline static result
-rvv_utf16_to_utf8_with_errors(const char16_t *src, size_t len, char *dst) {
-  size_t n = len;
-  const char16_t *srcBeg = src;
-  const char *dstBeg = dst;
-  size_t vl8m4 = __riscv_vsetvlmax_e8m4();
-  vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2(
-      __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4);
-
-  for (size_t vl, vlOut; n > 0;) {
-    vl = __riscv_vsetvl_e16m2(n);
-
-    vuint16m2_t v = __riscv_vle16_v_u16m2((uint16_t const *)src, vl);
-    v = simdutf_byteflip<bflip>(v, vl);
-    vbool8_t m234 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x80 - 1, vl);
-
-    if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */
-      vlOut = vl;
-      __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(v, vlOut),
-                          vlOut);
-      n -= vl, src += vl, dst += vlOut;
-      continue;
+// Counts UTF-16 code points: units outside 0xDC00..0xDFFF (low surrogates).
+template <endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t *in,
+                                               size_t size) {
+  size_t offset = 0, total = 0;
+  for (const size_t stop = size / 32 * 32; offset < stop; offset += 32) {
+    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + offset));
+    if (!match_system(big_endian)) {
+      block.swap_bytes();
     }
+    const uint64_t counted = block.not_in_range(0xDC00, 0xDFFF);
+    total += count_ones(counted) / 2;
+  }
+  return total + scalar::utf16::count_code_points<big_endian>(in + offset,
+                                                              size - offset);
+}
 
-    vbool8_t m34 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x800 - 1, vl);
-
-    if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */
-      /* 0: [     aaa|aabbbbbb]
-       * 1: [aabbbbbb|        ] vsll 8
-       * 2: [        |   aaaaa] vsrl 6
-       * 3: [00111111|00011111]
-       * 4: [  bbbbbb|000aaaaa] (1|2)&3
-       * 5: [11000000|11000000]
-       * 6: [10bbbbbb|110aaaaa] 4|5 */
-      vuint16m2_t twoByte = __riscv_vand_vx_u16m2(
-          __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(v, 8, vl),
-                               __riscv_vsrl_vx_u16m2(v, 6, vl), vl),
-          0b0011111100011111, vl);
-      vuint16m2_t vout16 =
-          __riscv_vor_vx_u16m2_mu(m234, v, twoByte, 0b1000000011000000, vl);
-      vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16);
-
-      /* Every high byte that is zero should be compressed
-       * low bytes should never be compressed, so we set them
-       * to all ones, and then create a non-zero bytes mask */
-      vbool4_t mcomp =
-          __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(
-                                       __riscv_vor_vx_u16m2(vout16, 0xFF, vl)),
-                                   0, vl * 2);
-      vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2);
-
-      vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2);
-      __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut);
-
-      n -= vl, src += vl, dst += vlOut;
-      continue;
+// UTF-8 byte length of UTF-16 input; the tail goes to the scalar routine.
+template <endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
+                                                    size_t size) {
+  size_t offset = 0, total = 0;
+  for (const size_t stop = size / 32 * 32; offset < stop; offset += 32) {
+    // Load 32 code units; swap bytes when match_system(big_endian) is false.
+    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + offset));
+    if (!match_system(big_endian)) {
+      block.swap_bytes();
     }
+    const uint64_t le_7f = block.lteq(0x7F);
+    const uint64_t le_7ff = block.lteq(0x7FF);
+    const uint64_t non_sur = block.not_in_range(0xD800, 0xDFFF);
 
-    vbool8_t sur = __riscv_vmseq_vx_u16m2_b8(
-        __riscv_vand_vx_u16m2(v, 0xF800, vl), 0xD800, vl);
-    long first = __riscv_vfirst_m_b8(sur, vl);
-    size_t tail = vl - first;
-    vl = first < 0 ? vl : first;
-
-    if (vl > 0) { /* 1/2/3 byte utf8 */
-      /* in: [aaaabbbb|bbcccccc]
-       * v1: [0bcccccc|        ] vsll  8
-       * v1: [10cccccc|        ] vsll  8 & 0b00111111 | 0b10000000
-       * v2: [        |110bbbbb] vsrl  6 & 0b00111111 | 0b11000000
-       * v2: [        |10bbbbbb] vsrl  6 & 0b00111111 | 0b10000000
-       * v3: [        |1110aaaa] vsrl 12 | 0b11100000
-       *  1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc]
-       *  2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc]
-       *  3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb]
-       * [10cccccc]
-       */
-      vuint16m2_t v1, v2, v3, v12;
-      v1 = __riscv_vor_vx_u16m2_mu(
-          m234, v, __riscv_vand_vx_u16m2(v, 0b00111111, vl), 0b10000000, vl);
-      v1 = __riscv_vsll_vx_u16m2(v1, 8, vl);
-
-      v2 = __riscv_vor_vx_u16m2(
-          __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 6, vl), 0b00111111,
-                                vl),
-          0b10000000, vl);
-      v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2,
-                                   0b01000000, vl);
-      v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 12, vl), 0b11100000,
-                                vl);
-      v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl);
+    // Popcounts are halved (presumably two mask bits per code unit).
+    const size_t n1 = count_ones(le_7f) / 2;
+    const size_t n2 = count_ones(le_7ff & ~le_7f) / 2;
+    const size_t n3 = count_ones(non_sur & ~le_7ff) / 2;
+    const size_t n_sur = 32 - count_ones(non_sur) / 2; // 2 bytes per surrogate
+    total += n1 + 2 * n2 + 3 * n3 + 2 * n_sur;
+  }
+  return total + scalar::utf16::utf8_length_from_utf16<big_endian>(
+                     in + offset, size - offset);
+}
 
-      vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl);
-      vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl);
-      vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123);
+// Returns count_code_points<big_endian>(in, size) as the UTF-32 length.
+template <endianness big_endian> simdutf_really_inline size_t
+utf32_length_from_utf16(const char16_t *in, size_t size) {
+  return count_code_points<big_endian>(in, size);
+}
 
-      vbool2_t mcomp = __riscv_vmor_mm_b2(
-          m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4);
-      vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4);
+simdutf_really_inline void
+change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
+  size_t done = 0; // code units already byte-swapped by the SIMD loop
 
-      vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4);
-      __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut);
+  // Swap 32 code units per step; the scalar call below takes what is left.
+  for (const size_t stop = size / 32 * 32; done < stop;
+       done += 32, output += 32) {
+    simd16x32<uint16_t> block(reinterpret_cast<const uint16_t *>(in + done));
+    block.swap_bytes();
+    block.store(reinterpret_cast<uint16_t *>(output));
+  }
 
-      n -= vl, src += vl, dst += vlOut;
-    }
+  scalar::utf16::change_endianness_utf16(in + done, size - done, output);
+}
 
-    if (tail)
-      while (n) {
-        uint16_t word = simdutf_byteflip<bflip>(src[0]);
-        if ((word & 0xFF80) == 0) {
-          break;
-        } else if ((word & 0xF800) == 0) {
-          break;
-        } else if ((word & 0xF800) != 0xD800) {
-          break;
-        } else {
-          // must be a surrogate pair
-          if (n <= 1)
-            return result(error_code::SURROGATE, src - srcBeg);
-          uint16_t diff = word - 0xD800;
-          if (diff > 0x3FF)
-            return result(error_code::SURROGATE, src - srcBeg);
-          uint16_t diff2 = simdutf_byteflip<bflip>(src[1]) - 0xDC00;
-          if (diff2 > 0x3FF)
-            return result(error_code::SURROGATE, src - srcBeg);
+} // namespace utf16
+} // unnamed namespace
+} // namespace lsx
+} // namespace simdutf
+/* end file src/generic/utf16.h */
 
-          uint32_t value = ((diff + 0x40) << 10) + diff2;
+//
+// Implementation-specific overrides
+//
+namespace simdutf {
+namespace lsx {
 
-          // will generate four UTF-8 bytes
-          // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
-          *dst++ = (char)((value >> 18) | 0b11110000);
-          *dst++ = (char)(((value >> 12) & 0b111111) | 0b10000000);
-          *dst++ = (char)(((value >> 6) & 0b111111) | 0b10000000);
-          *dst++ = (char)((value & 0b111111) | 0b10000000);
-          src += 2;
-          n -= 2;
-        }
-      }
+// Returns the BOM's encoding when one is present; otherwise a bitmask of the
+// encodings (UTF-8, UTF-16LE, UTF-32LE) the whole input validates as.
+simdutf_warn_unused int
+implementation::detect_encodings(const char *input,
+                                 size_t length) const noexcept {
+  const auto from_bom = simdutf::BOM::check_bom(input, length);
+  if (from_bom != encoding_type::unspecified) {
+    return from_bom; // a BOM is trusted without validating the payload
   }
-
-  return result(error_code::SUCCESS, dst - dstBeg);
+  // todo (upstream): reimplement as a one-pass algorithm.
+  int detected = 0;
+  if (validate_utf8(input, length)) {
+    detected |= encoding_type::UTF8;
+  }
+  // UTF-16/UTF-32 are tried only when the byte length is a whole number of
+  // 2-byte / 4-byte units.
+  if (length % 2 == 0 &&
+      validate_utf16le(reinterpret_cast<const char16_t *>(input), length / 2)) {
+    detected |= encoding_type::UTF16_LE;
+  }
+  if (length % 4 == 0 &&
+      validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4)) {
+    detected |= encoding_type::UTF32_LE;
+  }
+  return detected;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
-    const char16_t *src, size_t len, char *dst) const noexcept {
-  result res = convert_utf16le_to_utf8_with_errors(src, len, dst);
-  return res.error == error_code::SUCCESS ? res.count : 0;
+simdutf_warn_unused bool
+implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  return lsx::utf8_validation::generic_validate_utf8(buf, len);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
-    const char16_t *src, size_t len, char *dst) const noexcept {
-  result res = convert_utf16be_to_utf8_with_errors(src, len, dst);
-  return res.error == error_code::SUCCESS ? res.count : 0;
+simdutf_warn_unused result implementation::validate_utf8_with_errors(
+    const char *buf, size_t len) const noexcept {
+  return lsx::utf8_validation::generic_validate_utf8_with_errors(buf, len);
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
-    const char16_t *src, size_t len, char *dst) const noexcept {
-  return rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
+simdutf_warn_unused bool
+implementation::validate_ascii(const char *buf, size_t len) const noexcept {
+  return lsx::utf8_validation::generic_validate_ascii(buf, len);
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
-    const char16_t *src, size_t len, char *dst) const noexcept {
-  if (supports_zvbb())
-    return rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::ZVBB>(src, len, dst);
-  else
-    return rvv_utf16_to_utf8_with_errors<simdutf_ByteFlip::V>(src, len, dst);
+simdutf_warn_unused result implementation::validate_ascii_with_errors(
+    const char *buf, size_t len) const noexcept {
+  return lsx::utf8_validation::generic_validate_ascii_with_errors(buf, len);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
-    const char16_t *src, size_t len, char *dst) const noexcept {
-  return convert_utf16le_to_utf8(src, len, dst);
+simdutf_warn_unused bool
+implementation::validate_utf16le(const char16_t *buf,
+                                 size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    // Empty input is valid; this protects the implementation from nullptr.
+    return true;
+  }
+  const char16_t *tail = lsx_validate_utf16<endianness::LITTLE>(buf, len);
+  if (tail) {
+    return scalar::utf16::validate<endianness::LITTLE>(tail,
+                                                       len - (tail - buf));
+  } else {
+    return false;
+  }
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
-    const char16_t *src, size_t len, char *dst) const noexcept {
-  return convert_utf16be_to_utf8(src, len, dst);
+simdutf_warn_unused bool
+implementation::validate_utf16be(const char16_t *buf,
+                                 size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    // Empty input is valid; this protects the implementation from nullptr.
+    return true;
+  }
+  const char16_t *tail = lsx_validate_utf16<endianness::BIG>(buf, len);
+  if (tail) {
+    return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
 }
 
-template <simdutf_ByteFlip bflip>
-simdutf_really_inline static result
-rvv_utf16_to_utf32_with_errors(const char16_t *src, size_t len, char32_t *dst) {
-  const char16_t *const srcBeg = src;
-  char32_t *const dstBeg = dst;
-
-  constexpr const uint16_t ANY_SURROGATE_MASK = 0xf800;
-  constexpr const uint16_t ANY_SURROGATE_VALUE = 0xd800;
-  constexpr const uint16_t LO_SURROGATE_MASK = 0xfc00;
-  constexpr const uint16_t LO_SURROGATE_VALUE = 0xdc00;
-  constexpr const uint16_t HI_SURROGATE_MASK = 0xfc00;
-  constexpr const uint16_t HI_SURROGATE_VALUE = 0xd800;
-
-  uint16_t last = 0;
-  while (len > 0) {
-    size_t vl = __riscv_vsetvl_e16m2(len);
-    vuint16m2_t v0 = __riscv_vle16_v_u16m2((uint16_t const *)src, vl);
-    v0 = simdutf_byteflip<bflip>(v0, vl);
-
-    { // check fast-path
-      const vuint16m2_t v = __riscv_vand_vx_u16m2(v0, ANY_SURROGATE_MASK, vl);
-      const vbool8_t any_surrogate =
-          __riscv_vmseq_vx_u16m2_b8(v, ANY_SURROGATE_VALUE, vl);
-      if (__riscv_vfirst_m_b8(any_surrogate, vl) < 0) {
-        /* no surrogates */
-        __riscv_vse32_v_u32m4((uint32_t *)dst, __riscv_vzext_vf2_u32m4(v0, vl),
-                              vl);
-        len -= vl;
-        src += vl;
-        dst += vl;
-        continue;
-      }
-    }
-
-    if ((simdutf_byteflip<bflip>(src[0]) & LO_SURROGATE_MASK) ==
-        LO_SURROGATE_VALUE) {
-      return result(error_code::SURROGATE, src - srcBeg);
-    }
-
-    // decode surrogates
-    vuint16m2_t v1 = __riscv_vslide1down_vx_u16m2(v0, 0, vl);
-    vl = __riscv_vsetvl_e16m2(vl - 1);
-    if (vl == 0) {
-      return result(error_code::SURROGATE, src - srcBeg);
-    }
-
-    const vbool8_t surhi = __riscv_vmseq_vx_u16m2_b8(
-        __riscv_vand_vx_u16m2(v0, HI_SURROGATE_MASK, vl), HI_SURROGATE_VALUE,
-        vl);
-    const vbool8_t surlo = __riscv_vmseq_vx_u16m2_b8(
-        __riscv_vand_vx_u16m2(v1, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE,
-        vl);
-
-    // compress everything but lo surrogates
-    const vbool8_t compress = __riscv_vmsne_vx_u16m2_b8(
-        __riscv_vand_vx_u16m2(v0, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE,
-        vl);
-
-    {
-      const vbool8_t diff = __riscv_vmxor_mm_b8(surhi, surlo, vl);
-      const long idx = __riscv_vfirst_m_b8(diff, vl);
-      if (idx >= 0) {
-        uint16_t word = simdutf_byteflip<bflip>(src[idx]);
-        if (word < 0xD800 || word > 0xDBFF) {
-          return result(error_code::SURROGATE, src - srcBeg + idx + 1);
-        }
-        return result(error_code::SURROGATE, src - srcBeg + idx);
-      }
-    }
-
-    last = simdutf_byteflip<bflip>(src[vl]);
-    vuint32m4_t utf32 = __riscv_vzext_vf2_u32m4(v0, vl);
-
-    // v0 = 110110yyyyyyyyyy (0xd800 + yyyyyyyyyy) --- hi surrogate
-    // v1 = 110111xxxxxxxxxx (0xdc00 + xxxxxxxxxx) --- lo surrogate
-
-    // t0 = u16(                    0000_00yy_yyyy_yyyy)
-    const vuint32m4_t t0 =
-        __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v0, 0x03ff, vl), vl);
-    // t1 = u32(0000_0000_0000_yyyy_yyyy_yy00_0000_0000)
-    const vuint32m4_t t1 = __riscv_vsll_vx_u32m4(t0, 10, vl);
-
-    // t2 = u32(0000_0000_0000_0000_0000_00xx_xxxx_xxxx)
-    const vuint32m4_t t2 =
-        __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v1, 0x03ff, vl), vl);
-
-    // t3 = u32(0000_0000_0000_yyyy_yyyy_yyxx_xxxx_xxxx)
-    const vuint32m4_t t3 = __riscv_vor_vv_u32m4(t1, t2, vl);
+simdutf_warn_unused result implementation::validate_utf16le_with_errors(
+    const char16_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+  result res = lsx_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(
+        buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
+}
 
-    // t4 = utf32 from surrogate pairs
-    const vuint32m4_t t4 = __riscv_vadd_vx_u32m4(t3, 0x10000, vl);
+simdutf_warn_unused result implementation::validate_utf16be_with_errors(
+    const char16_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+  result res = lsx_validate_utf16_with_errors<endianness::BIG>(buf, len);
+  if (res.count != len) {
+    result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(
+        buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
+}
 
-    const vuint32m4_t result = __riscv_vmerge_vvm_u32m4(utf32, t4, surhi, vl);
+simdutf_warn_unused bool
+implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    // Empty input is valid; this protects the implementation from nullptr.
+    return true;
+  }
+  const char32_t *tail = lsx_validate_utf32le(buf, len);
+  if (tail) {
+    return scalar::utf32::validate(tail, len - (tail - buf));
+  } else {
+    return false;
+  }
+}
 
-    const vuint32m4_t comp = __riscv_vcompress_vm_u32m4(result, compress, vl);
-    const size_t vlOut = __riscv_vcpop_m_b8(compress, vl);
-    __riscv_vse32_v_u32m4((uint32_t *)dst, comp, vlOut);
+simdutf_warn_unused result implementation::validate_utf32_with_errors(
+    const char32_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+  result res = lsx_validate_utf32le_with_errors(buf, len);
+  if (res.count != len) {
+    result scalar_res =
+        scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
+    return result(scalar_res.error, res.count + scalar_res.count);
+  } else {
+    return res;
+  }
+}
 
-    len -= vl;
-    src += vl;
-    dst += vlOut;
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
+    const char *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char *, char *> ret =
+      lsx_convert_latin1_to_utf8(buf, len, utf8_output);
+  size_t converted_chars = ret.second - utf8_output;
 
-    if ((last & LO_SURROGATE_MASK) == LO_SURROGATE_VALUE) {
-      // last item is lo surrogate and got already consumed
-      len -= 1;
-      src += 1;
-    }
+  if (ret.first != buf + len) {
+    const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert(
+        ret.first, len - (ret.first - buf), ret.second);
+    converted_chars += scalar_converted_chars;
   }
-
-  return result(error_code::SUCCESS, dst - dstBeg);
+  return converted_chars;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
-    const char16_t *src, size_t len, char32_t *dst) const noexcept {
-  result res = convert_utf16le_to_utf32_with_errors(src, len, dst);
-  return res.error == error_code::SUCCESS ? res.count : 0;
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char *, char16_t *> ret =
+      lsx_convert_latin1_to_utf16le(buf, len, utf16_output);
+  size_t converted_chars = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_converted_chars =
+        scalar::latin1_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    converted_chars += scalar_converted_chars;
+  }
+  return converted_chars;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
-    const char16_t *src, size_t len, char32_t *dst) const noexcept {
-  result res = convert_utf16be_to_utf32_with_errors(src, len, dst);
-  return res.error == error_code::SUCCESS ? res.count : 0;
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char *, char16_t *> ret =
+      lsx_convert_latin1_to_utf16be(buf, len, utf16_output);
+  size_t converted_chars = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_converted_chars =
+        scalar::latin1_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    converted_chars += scalar_converted_chars;
+  }
+  return converted_chars;
 }
 
-simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
-    const char16_t *src, size_t len, char32_t *dst) const noexcept {
-  return rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::NONE>(src, len, dst);
+simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char *, char32_t *> ret =
+      lsx_convert_latin1_to_utf32(buf, len, utf32_output);
+  size_t converted_chars = ret.second - utf32_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
+        ret.first, len - (ret.first - buf), ret.second);
+    converted_chars += scalar_converted_chars;
+  }
+  return converted_chars;
 }
 
-simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
-    const char16_t *src, size_t len, char32_t *dst) const noexcept {
-  if (supports_zvbb())
-    return rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::ZVBB>(src, len,
-                                                                  dst);
-  else
-    return rvv_utf16_to_utf32_with_errors<simdutf_ByteFlip::V>(src, len, dst);
+simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  utf8_to_latin1::validating_transcoder converter;
+  return converter.convert(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
-    const char16_t *src, size_t len, char32_t *dst) const noexcept {
-  return convert_utf16le_to_utf32(src, len, dst);
+simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  utf8_to_latin1::validating_transcoder converter;
+  return converter.convert_with_errors(buf, len, latin1_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
-    const char16_t *src, size_t len, char32_t *dst) const noexcept {
-  return convert_utf16be_to_utf32(src, len, dst);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
+    const char *buf, size_t len, char *latin1_output) const noexcept {
+  return lsx::utf8_to_latin1::convert_valid(buf, len, latin1_output);
 }
-/* end file src/rvv/rvv_utf16_to.inl.cpp */
-/* begin file src/rvv/rvv_utf32_to.inl.cpp */
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
-    const char32_t *src, size_t len, char *dst) const noexcept {
-  result res = convert_utf32_to_latin1_with_errors(src, len, dst);
-  return res.error == error_code::SUCCESS ? res.count : 0;
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert<endianness::LITTLE>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
-    const char32_t *src, size_t len, char *dst) const noexcept {
-  const char32_t *const beg = src;
-  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
-    vl = __riscv_vsetvl_e32m8(len);
-    vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl);
-    long idx = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 255, vl), vl);
-    if (idx >= 0)
-      return result(error_code::TOO_LARGE, src - beg + idx);
-    /* We don't use vcompress here, because its performance varies widely on
-     * current platforms. This might be worth reconsidering once there is more
-     * hardware available. */
-    __riscv_vse8_v_u8m2(
-        (uint8_t *)dst,
-        __riscv_vncvt_x_x_w_u8m2(__riscv_vncvt_x_x_w_u16m4(v, vl), vl), vl);
-  }
-  return result(error_code::SUCCESS, src - beg);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert<endianness::BIG>(buf, len, utf16_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
-    const char32_t *src, size_t len, char *dst) const noexcept {
-  return convert_utf32_to_latin1(src, len, dst);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert_with_errors<endianness::LITTLE>(buf, len,
+                                                           utf16_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
-    const char32_t *src, size_t len, char *dst) const noexcept {
-  size_t n = len;
-  const char32_t *srcBeg = src;
-  const char *dstBeg = dst;
-  size_t vl8m4 = __riscv_vsetvlmax_e8m4();
-  vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2(
-      __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4);
-
-  for (size_t vl, vlOut; n > 0;) {
-    vl = __riscv_vsetvl_e32m4(n);
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
+    const char *buf, size_t len, char16_t *utf16_output) const noexcept {
+  utf8_to_utf16::validating_transcoder converter;
+  return converter.convert_with_errors<endianness::BIG>(buf, len, utf16_output);
+}
 
-    vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t const *)src, vl);
-    vbool8_t m234 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x80 - 1, vl);
-    vuint16m2_t vn = __riscv_vncvt_x_x_w_u16m2(v, vl);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  return utf8_to_utf16::convert_valid<endianness::LITTLE>(input, size,
+                                                          utf16_output);
+}
 
-    if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */
-      vlOut = vl;
-      __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(vn, vlOut),
-                          vlOut);
-      n -= vl, src += vl, dst += vlOut;
-      continue;
-    }
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
+    const char *input, size_t size, char16_t *utf16_output) const noexcept {
+  return utf8_to_utf16::convert_valid<endianness::BIG>(input, size,
+                                                       utf16_output);
+}
 
-    vbool8_t m34 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x800 - 1, vl);
+simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder converter;
+  return converter.convert(buf, len, utf32_output);
+}
 
-    if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */
-      /* 0: [     aaa|aabbbbbb]
-       * 1: [aabbbbbb|        ] vsll 8
-       * 2: [        |   aaaaa] vsrl 6
-       * 3: [00111111|00111111]
-       * 4: [  bbbbbb|000aaaaa] (1|2)&3
-       * 5: [10000000|11000000]
-       * 6: [10bbbbbb|110aaaaa] 4|5 */
-      vuint16m2_t twoByte = __riscv_vand_vx_u16m2(
-          __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(vn, 8, vl),
-                               __riscv_vsrl_vx_u16m2(vn, 6, vl), vl),
-          0b0011111100111111, vl);
-      vuint16m2_t vout16 =
-          __riscv_vor_vx_u16m2_mu(m234, vn, twoByte, 0b1000000011000000, vl);
-      vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16);
+simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
+    const char *buf, size_t len, char32_t *utf32_output) const noexcept {
+  utf8_to_utf32::validating_transcoder converter;
+  return converter.convert_with_errors(buf, len, utf32_output);
+}
 
-      /* Every high byte that is zero should be compressed
-       * low bytes should never be compressed, so we set them
-       * to all ones, and then create a non-zero bytes mask */
-      vbool4_t mcomp =
-          __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2(
-                                       __riscv_vor_vx_u16m2(vout16, 0xFF, vl)),
-                                   0, vl * 2);
-      vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2);
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
+    const char *input, size_t size, char32_t *utf32_output) const noexcept {
+  return utf8_to_utf32::convert_valid(input, size, utf32_output);
+}
 
-      vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2);
-      __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut);
+simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char16_t *, char *> ret =
+      lsx_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - latin1_output;
 
-      n -= vl, src += vl, dst += vlOut;
-      continue;
-    }
-    long idx1 =
-        __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl);
-    vbool8_t sur = __riscv_vmseq_vx_u32m4_b8(
-        __riscv_vand_vx_u32m4(v, 0xFFFFF800, vl), 0xD800, vl);
-    long idx2 = __riscv_vfirst_m_b8(sur, vl);
-    if (idx1 >= 0 && idx2 >= 0) {
-      if (idx1 <= idx2) {
-        return result(error_code::TOO_LARGE, src - srcBeg + idx1);
-      } else {
-        return result(error_code::SURROGATE, src - srcBeg + idx2);
-      }
-    }
-    if (idx1 >= 0) {
-      return result(error_code::TOO_LARGE, src - srcBeg + idx1);
-    }
-    if (idx2 >= 0) {
-      return result(error_code::SURROGATE, src - srcBeg + idx2);
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_latin1::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
     }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
 
-    vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x10000 - 1, vl);
-    long first = __riscv_vfirst_m_b8(m4, vl);
-    size_t tail = vl - first;
-    vl = first < 0 ? vl : first;
+simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char16_t *, char *> ret =
+      lsx_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - latin1_output;
 
-    if (vl > 0) { /* 1/2/3 byte utf8 */
-      /* vn: [aaaabbbb|bbcccccc]
-       * v1: [0bcccccc|        ] vsll  8
-       * v1: [10cccccc|        ] vsll  8 & 0b00111111 | 0b10000000
-       * v2: [        |110bbbbb] vsrl  6 & 0b00111111 | 0b11000000
-       * v2: [        |10bbbbbb] vsrl  6 & 0b00111111 | 0b10000000
-       * v3: [        |1110aaaa] vsrl 12 | 0b11100000
-       *  1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc]
-       *  2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc]
-       *  3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb]
-       * [10cccccc]
-       */
-      vuint16m2_t v1, v2, v3, v12;
-      v1 = __riscv_vor_vx_u16m2_mu(
-          m234, vn, __riscv_vand_vx_u16m2(vn, 0b00111111, vl), 0b10000000, vl);
-      v1 = __riscv_vsll_vx_u16m2(v1, 8, vl);
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_latin1::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
 
-      v2 = __riscv_vor_vx_u16m2(
-          __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 6, vl), 0b00111111,
-                                vl),
-          0b10000000, vl);
-      v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2,
-                                   0b01000000, vl);
-      v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 12, vl), 0b11100000,
-                                vl);
-      v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl);
+simdutf_warn_unused result
+implementation::convert_utf16le_to_latin1_with_errors(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<result, char *> ret =
+      lsx_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
+          buf, len, latin1_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_latin1::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      latin1_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
 
-      vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl);
-      vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl);
-      vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123);
+simdutf_warn_unused result
+implementation::convert_utf16be_to_latin1_with_errors(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<result, char *> ret =
+      lsx_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
+                                                               latin1_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_latin1::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      latin1_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
 
-      vbool2_t mcomp = __riscv_vmor_mm_b2(
-          m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4);
-      vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4);
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  // optimization opportunity: implement a custom function.
+  return convert_utf16be_to_latin1(buf, len, latin1_output);
+}
 
-      vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4);
-      __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut);
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
+    const char16_t *buf, size_t len, char *latin1_output) const noexcept {
+  // optimization opportunity: implement a custom function.
+  return convert_utf16le_to_latin1(buf, len, latin1_output);
+}
 
-      n -= vl, src += vl, dst += vlOut;
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char16_t *, char *> ret =
+      lsx_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf8::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
     }
-
-    if (tail)
-      while (n) {
-        uint32_t word = src[0];
-        if (word < 0x10000)
-          break;
-        if (word > 0x10FFFF)
-          return result(error_code::TOO_LARGE, src - srcBeg);
-        *dst++ = (uint8_t)((word >> 18) | 0b11110000);
-        *dst++ = (uint8_t)(((word >> 12) & 0b111111) | 0b10000000);
-        *dst++ = (uint8_t)(((word >> 6) & 0b111111) | 0b10000000);
-        *dst++ = (uint8_t)((word & 0b111111) | 0b10000000);
-        ++src;
-        --n;
-      }
+    saved_bytes += scalar_saved_bytes;
   }
-
-  return result(error_code::SUCCESS, dst - dstBeg);
+  return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
-    const char32_t *src, size_t len, char *dst) const noexcept {
-  result res = convert_utf32_to_utf8_with_errors(src, len, dst);
-  return res.error == error_code::SUCCESS ? res.count : 0;
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  std::pair<const char16_t *, char *> ret =
+      lsx_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf8::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
-    const char32_t *src, size_t len, char *dst) const noexcept {
-  return convert_utf32_to_utf8(src, len, dst);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char *> ret =
+      lsx_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len,
+                                                                utf8_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf8_output; // Set count to the number of 8-bit code units written
+  return ret.first;
 }
 
-template <simdutf_ByteFlip bflip>
-simdutf_really_inline static result
-rvv_convert_utf32_to_utf16_with_errors(const char32_t *src, size_t len,
-                                       char16_t *dst) {
-  size_t vl8m2 = __riscv_vsetvlmax_e8m2();
-  vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(
-      __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2);
-  const char16_t *dstBeg = dst;
-  const char32_t *srcBeg = src;
-  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) {
-    vl = __riscv_vsetvl_e32m4(len);
-    vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t *)src, vl);
-    vuint32m4_t off = __riscv_vadd_vx_u32m4(v, 0xFFFF2000, vl);
-    long idx1 =
-        __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl);
-    long idx2 = __riscv_vfirst_m_b8(
-        __riscv_vmsgtu_vx_u32m4_b8(off, 0xFFFFF7FF, vl), vl);
-    if (idx1 >= 0 && idx2 >= 0) {
-      if (idx1 <= idx2)
-        return result(error_code::TOO_LARGE, src - srcBeg + idx1);
-      return result(error_code::SURROGATE, src - srcBeg + idx2);
-    }
-    if (idx1 >= 0)
-      return result(error_code::TOO_LARGE, src - srcBeg + idx1);
-    if (idx2 >= 0)
-      return result(error_code::SURROGATE, src - srcBeg + idx2);
-    long idx =
-        __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl);
-    if (idx < 0) {
-      vlOut = vl;
-      vuint16m2_t n =
-          simdutf_byteflip<bflip>(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut);
-      __riscv_vse16_v_u16m2((uint16_t *)dst, n, vlOut);
-      continue;
+simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char *> ret =
+      lsx_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len,
+                                                             utf8_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
     }
-    vlOut = rvv_utf32_store_utf16_m4<bflip>((uint16_t *)dst, v, vl, m4even);
   }
-  return result(error_code::SUCCESS, dst - dstBeg);
+  ret.first.count =
+      ret.second -
+      utf8_output; // Set count to the number of 8-bit code units written
+  return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
-    const char32_t *src, size_t len, char16_t *dst) const noexcept {
-  result res = convert_utf32_to_utf16le_with_errors(src, len, dst);
-  return res.error == error_code::SUCCESS ? res.count : 0;
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return convert_utf16le_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
-    const char32_t *src, size_t len, char16_t *dst) const noexcept {
-  result res = convert_utf32_to_utf16be_with_errors(src, len, dst);
-  return res.error == error_code::SUCCESS ? res.count : 0;
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
+    const char16_t *buf, size_t len, char *utf8_output) const noexcept {
+  return convert_utf16be_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
-    const char32_t *src, size_t len, char16_t *dst) const noexcept {
-  return rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::NONE>(
-      src, len, dst);
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return 0;
+  }
+  std::pair<const char32_t *, char *> ret =
+      lsx_convert_utf32_to_utf8(buf, len, utf8_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf8_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert(
+        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
-    const char32_t *src, size_t len, char16_t *dst) const noexcept {
-  if (supports_zvbb())
-    return rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::ZVBB>(
-        src, len, dst);
-  else
-    return rvv_convert_utf32_to_utf16_with_errors<simdutf_ByteFlip::V>(src, len,
-                                                                       dst);
+simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char *> ret =
+      lsx_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+  if (ret.first.count != len) {
+    result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
+        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf8_output; // Set count to the number of 8-bit code units written
+  return ret.first;
 }
 
-template <simdutf_ByteFlip bflip>
-simdutf_really_inline static size_t
-rvv_convert_valid_utf32_to_utf16(const char32_t *src, size_t len,
-                                 char16_t *dst) {
-  size_t vl8m2 = __riscv_vsetvlmax_e8m2();
-  vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(
-      __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2);
-  char16_t *dstBeg = dst;
-  for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) {
-    vl = __riscv_vsetvl_e32m4(len);
-    vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t *)src, vl);
-    if (__riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl) <
-        0) {
-      vlOut = vl;
-      vuint16m2_t n =
-          simdutf_byteflip<bflip>(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut);
-      __riscv_vse16_v_u16m2((uint16_t *)dst, n, vlOut);
-      continue;
+simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char16_t *, char32_t *> ret =
+      lsx_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf32_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf32::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
     }
-    vlOut = rvv_utf32_store_utf16_m4<bflip>((uint16_t *)dst, v, vl, m4even);
+    saved_bytes += scalar_saved_bytes;
   }
-  return dst - dstBeg;
+  return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
-    const char32_t *src, size_t len, char16_t *dst) const noexcept {
-  return rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::NONE>(src, len,
-                                                                  dst);
+simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  std::pair<const char16_t *, char32_t *> ret =
+      lsx_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf32_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf16_to_utf32::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
+    }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
-    const char32_t *src, size_t len, char16_t *dst) const noexcept {
-  if (supports_zvbb())
-    return rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::ZVBB>(src, len,
-                                                                    dst);
-  else
-    return rvv_convert_valid_utf32_to_utf16<simdutf_ByteFlip::V>(src, len, dst);
+simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char32_t *> ret =
+      lsx_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len,
+                                                                 utf32_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf32::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
+  }
+  ret.first.count =
+      ret.second -
+      utf32_output; // Set count to the number of 32-bit code units written
+  return ret.first;
 }
-/* end file src/rvv/rvv_utf32_to.inl.cpp */
-/* begin file src/rvv/rvv_utf8_to.inl.cpp */
-template <typename Tdst, simdutf_ByteFlip bflip, bool validate = true>
-simdutf_really_inline static size_t rvv_utf8_to_common(char const *src,
-                                                       size_t len, Tdst *dst) {
-  static_assert(std::is_same<Tdst, uint16_t>() ||
-                    std::is_same<Tdst, uint32_t>(),
-                "invalid type");
-  constexpr bool is16 = std::is_same<Tdst, uint16_t>();
-  constexpr endianness endian =
-      bflip == simdutf_ByteFlip::NONE ? endianness::LITTLE : endianness::BIG;
-  const auto scalar = [](char const *in, size_t count, Tdst *out) {
-    return is16 ? scalar::utf8_to_utf16::convert<endian>(in, count,
-                                                         (char16_t *)out)
-                : scalar::utf8_to_utf32::convert(in, count, (char32_t *)out);
-  };
 
-  if (len < 32)
-    return scalar(src, len, dst);
-
-  /* validate first three bytes */
-  if (validate) {
-    size_t idx = 3;
-    while (idx < len && (src[idx] >> 6) == 0b10)
-      ++idx;
-    if (idx > 3 + 3 || !scalar::utf8::validate(src, idx))
-      return 0;
+simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char32_t *> ret =
+      lsx_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len,
+                                                              utf32_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res =
+        scalar::utf16_to_utf32::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
+    }
   }
+  ret.first.count =
+      ret.second -
+      utf32_output; // Set count to the number of 32-bit code units written
+  return ret.first;
+}
 
-  size_t tail = 3;
-  size_t n = len - tail;
-  Tdst *beg = dst;
-
-  static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080};
-  static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB};
-  static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6};
-
-  const vuint8m1_t err1tbl =
-      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2));
-  const vuint8m1_t err2tbl =
-      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2));
-  const vuint8m1_t err3tbl =
-      __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2));
-
-  size_t vl8m2 = __riscv_vsetvlmax_e8m2();
-  vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4(
-      __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2);
-
-  for (size_t vl, vlOut; n > 0; n -= vl, src += vl, dst += vlOut) {
-    vl = __riscv_vsetvl_e8m2(n);
-
-    vuint8m2_t v0 = __riscv_vle8_v_u8m2((uint8_t const *)src, vl);
-    uint64_t max = __riscv_vmv_x_s_u8m1_u8(
-        __riscv_vredmaxu_vs_u8m2_u8m1(v0, __riscv_vmv_s_x_u8m1(0, vl), vl));
-
-    uint8_t next0 = src[vl + 0];
-    uint8_t next1 = src[vl + 1];
-    uint8_t next2 = src[vl + 2];
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char32_t *, char *> ret =
+      lsx_convert_utf32_to_latin1(buf, len, latin1_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - latin1_output;
 
-    /* fast path: ASCII */
-    if ((max | next0 | next1 | next2) < 0b10000000) {
-      vlOut = vl;
-      if (is16)
-        __riscv_vse16_v_u16m4(
-            (uint16_t *)dst,
-            simdutf_byteflip<bflip>(__riscv_vzext_vf2_u16m4(v0, vlOut), vlOut),
-            vlOut);
-      else
-        __riscv_vse32_v_u32m8((uint32_t *)dst,
-                              __riscv_vzext_vf4_u32m8(v0, vlOut), vlOut);
-      continue;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
+        ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
     }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
 
-    /* see "Validating UTF-8 In Less Than One Instruction Per Byte"
-     * https://arxiv.org/abs/2010.03090 */
-    vuint8m2_t v1 = __riscv_vslide1down_vx_u8m2(v0, next0, vl);
-    vuint8m2_t v2 = __riscv_vslide1down_vx_u8m2(v1, next1, vl);
-    vuint8m2_t v3 = __riscv_vslide1down_vx_u8m2(v2, next2, vl);
-
-    if (validate) {
-      vuint8m2_t s1 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(
-          __riscv_vreinterpret_v_u8m2_u16m2(v2), 4, __riscv_vsetvlmax_e16m2()));
-      vuint8m2_t s3 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2(
-          __riscv_vreinterpret_v_u8m2_u16m2(v3), 4, __riscv_vsetvlmax_e16m2()));
-
-      vuint8m2_t idx2 = __riscv_vand_vx_u8m2(v2, 0xF, vl);
-      vuint8m2_t idx1 = __riscv_vand_vx_u8m2(s1, 0xF, vl);
-      vuint8m2_t idx3 = __riscv_vand_vx_u8m2(s3, 0xF, vl);
-
-      vuint8m2_t err1 = simdutf_vrgather_u8m1x2(err1tbl, idx1);
-      vuint8m2_t err2 = simdutf_vrgather_u8m1x2(err2tbl, idx2);
-      vuint8m2_t err3 = simdutf_vrgather_u8m1x2(err3tbl, idx3);
-      vint8m2_t errs = __riscv_vreinterpret_v_u8m2_i8m2(
-          __riscv_vand_vv_u8m2(__riscv_vand_vv_u8m2(err1, err2, vl), err3, vl));
-
-      vbool4_t is_3 = __riscv_vmsgtu_vx_u8m2_b4(v1, 0b11100000 - 1, vl);
-      vbool4_t is_4 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b11110000 - 1, vl);
-      vbool4_t is_34 = __riscv_vmor_mm_b4(is_3, is_4, vl);
-      vbool4_t err34 =
-          __riscv_vmxor_mm_b4(is_34, __riscv_vmslt_vx_i8m2_b4(errs, 0, vl), vl);
-      vbool4_t errm =
-          __riscv_vmor_mm_b4(__riscv_vmsgt_vx_i8m2_b4(errs, 0, vl), err34, vl);
-      if (__riscv_vfirst_m_b4(errm, vl) >= 0)
-        return 0;
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<result, char *> ret =
+      lsx_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
+  if (ret.first.error) {
+    return ret.first;
+  } // Can return directly since scalar fallback already found correct
+    // ret.first.count
+  if (ret.first.count != len) { // All good so far, but not finished
+    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
+        buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
     }
+  }
+  ret.first.count =
+      ret.second -
+      latin1_output; // Set count to the number of 8-bit code units written
+  return ret.first;
+}
 
-    /* decoding */
-
-    /* mask of non continuation bytes */
-    vbool4_t m =
-        __riscv_vmsgt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v0), -65, vl);
-    vlOut = __riscv_vcpop_m_b4(m, vl);
-
-    /* extract first and second bytes */
-    vuint8m2_t b1 = __riscv_vcompress_vm_u8m2(v0, m, vl);
-    vuint8m2_t b2 = __riscv_vcompress_vm_u8m2(v1, m, vl);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  std::pair<const char32_t *, char *> ret =
+      lsx_convert_utf32_to_latin1(buf, len, latin1_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - latin1_output;
 
-    /* fast path: one and two byte */
-    if (max < 0b11100000) {
-      b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid(
+        ret.first, len - (ret.first - buf), ret.second);
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
 
-      vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut);
-      b1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
+    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  // optimization opportunity: implement a custom function.
+  return convert_utf32_to_utf8(buf, len, utf8_output);
+}
 
-      vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(
-          b1,
-          __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1,
-                                  vlOut),
-          vlOut);
-      b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut);
-      if (is16)
-        __riscv_vse16_v_u16m4((uint16_t *)dst,
-                              simdutf_byteflip<bflip>(b12, vlOut), vlOut);
-      else
-        __riscv_vse32_v_u32m8((uint32_t *)dst,
-                              __riscv_vzext_vf2_u32m8(b12, vlOut), vlOut);
-      continue;
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char32_t *, char16_t *> ret =
+      lsx_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf32_to_utf16::convert<endianness::LITTLE>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
     }
+    saved_bytes += scalar_saved_bytes;
+  }
 
-    /* fast path: one, two and three byte */
-    if (max < 0b11110000) {
-      vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);
-
-      b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut);
-      b3 = __riscv_vand_vx_u8m2(b3, 0b00111111, vlOut);
-
-      vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut);
-      vbool4_t m3 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b11011111, vlOut);
-
-      vuint8m2_t t1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut);
-      b1 = __riscv_vand_vx_u8m2_mu(m3, t1, b1, 15, vlOut);
+  return saved_bytes;
+}
 
-      vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4(
-          b1,
-          __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1,
-                                  vlOut),
-          vlOut);
-      b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut);
-      vuint16m4_t b123 = __riscv_vwaddu_wv_u16m4_mu(
-          m3, b12, __riscv_vsll_vx_u16m4_mu(m3, b12, b12, 6, vlOut), b3, vlOut);
-      if (is16)
-        __riscv_vse16_v_u16m4((uint16_t *)dst,
-                              simdutf_byteflip<bflip>(b123, vlOut), vlOut);
-      else
-        __riscv_vse32_v_u32m8((uint32_t *)dst,
-                              __riscv_vzext_vf2_u32m8(b123, vlOut), vlOut);
-      continue;
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  std::pair<const char32_t *, char16_t *> ret =
+      lsx_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+  if (ret.first == nullptr) {
+    return 0;
+  }
+  size_t saved_bytes = ret.second - utf16_output;
+  if (ret.first != buf + len) {
+    const size_t scalar_saved_bytes =
+        scalar::utf32_to_utf16::convert<endianness::BIG>(
+            ret.first, len - (ret.first - buf), ret.second);
+    if (scalar_saved_bytes == 0) {
+      return 0;
     }
+    saved_bytes += scalar_saved_bytes;
+  }
+  return saved_bytes;
+}
 
-    /* extract third and fourth bytes */
-    vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);
-    vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl);
-
-    /* remove prefix from leading bytes
-     *
-     * We could also use vrgather here, but it increases register pressure,
-     * and its performance varies widely on current platforms. It might be
-     * worth reconsidering, though, once there is more hardware available.
-     * Same goes for the __riscv_vsrl_vv_u32m4 correction step.
-     *
-     * We shift left and then right by the number of bytes in the prefix,
-     * which can be calculated as follows:
-     *         x                                max(x-10, 0)
-     * 0xxx -> 0000-0111 -> sift by 0 or 1   -> 0
-     * 10xx -> 1000-1011 -> don't care
-     * 110x -> 1100,1101 -> sift by 3        -> 2,3
-     * 1110 -> 1110      -> sift by 4        -> 4
-     * 1111 -> 1111      -> sift by 5        -> 5
-     *
-     * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we
-     * just need to manually detect and handle the one special case:
-     */
-#define SIMDUTF_RVV_UTF8_TO_COMMON_M1(idx)                                     \
-  vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx);                           \
-  vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx);                           \
-  vuint8m1_t c3 = __riscv_vget_v_u8m2_u8m1(b3, idx);                           \
-  vuint8m1_t c4 = __riscv_vget_v_u8m2_u8m1(b4, idx);                           \
-  /* remove prefix from trailing bytes */                                      \
-  c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut);                            \
-  c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut);                            \
-  c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut);                            \
-  vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut);                       \
-  shift = __riscv_vmerge_vxm_u8m1(__riscv_vssubu_vx_u8m1(shift, 10, vlOut), 3, \
-                                  __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut),  \
-                                  vlOut);                                      \
-  c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut);                                 \
-  c1 = __riscv_vsrl_vv_u8m1(c1, shift, vlOut);                                 \
-  /* unconditionally widen and combine to c1234 */                             \
-  vuint16m2_t c34 = __riscv_vwaddu_wv_u16m2(                                   \
-      __riscv_vwmulu_vx_u16m2(c3, 1 << 6, vlOut), c4, vlOut);                  \
-  vuint16m2_t c12 = __riscv_vwaddu_wv_u16m2(                                   \
-      __riscv_vwmulu_vx_u16m2(c1, 1 << 6, vlOut), c2, vlOut);                  \
-  vuint32m4_t c1234 = __riscv_vwaddu_wv_u32m4(                                 \
-      __riscv_vwmulu_vx_u32m4(c12, 1 << 12, vlOut), c34, vlOut);               \
-  /* derive required right-shift amount from `shift` to reduce                 \
-   * c1234 to the required number of bytes */                                  \
-  c1234 = __riscv_vsrl_vv_u32m4(                                               \
-      c1234,                                                                   \
-      __riscv_vzext_vf4_u32m4(                                                 \
-          __riscv_vmul_vx_u8m1(                                                \
-              __riscv_vrsub_vx_u8m1(__riscv_vssubu_vx_u8m1(shift, 2, vlOut),   \
-                                    3, vlOut),                                 \
-              6, vlOut),                                                       \
-          vlOut),                                                              \
-      vlOut);                                                                  \
-  /* store result in desired format */                                         \
-  if (is16)                                                                    \
-    vlDst = rvv_utf32_store_utf16_m4<bflip>((uint16_t *)dst, c1234, vlOut,     \
-                                            m4even);                           \
-  else                                                                         \
-    vlDst = vlOut, __riscv_vse32_v_u32m4((uint32_t *)dst, c1234, vlOut);
-
-    /* Unrolling this manually reduces register pressure and allows
-     * us to terminate early. */
-    {
-      size_t vlOutm2 = vlOut, vlDst;
-      vlOut = __riscv_vsetvl_e8m1(vlOut);
-      SIMDUTF_RVV_UTF8_TO_COMMON_M1(0)
-      if (vlOutm2 == vlOut) {
-        vlOut = vlDst;
-        continue;
-      }
-
-      dst += vlDst;
-      vlOut = vlOutm2 - vlOut;
-    }
-    {
-      size_t vlDst;
-      SIMDUTF_RVV_UTF8_TO_COMMON_M1(1)
-      vlOut = vlDst;
+simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char16_t *> ret =
+      lsx_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len,
+                                                                 utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res =
+        scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
     }
-
-#undef SIMDUTF_RVV_UTF8_TO_COMMON_M1
   }
+  ret.first.count =
+      ret.second -
+      utf16_output; // Set count to the number of 16-bit code units written
+  return ret.first;
+}
 
-  /* validate the last character and reparse it + tail */
-  if (len > tail) {
-    if ((src[0] >> 6) == 0b10)
-      --dst;
-    while ((src[0] >> 6) == 0b10 && tail < len)
-      --src, ++tail;
-    if (is16) {
-      /* go back one more, when on high surrogate */
-      if (simdutf_byteflip<bflip>((uint16_t)dst[-1]) >= 0xD800 &&
-          simdutf_byteflip<bflip>((uint16_t)dst[-1]) <= 0xDBFF)
-        --dst;
+simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  // ret.first.count is always the position in the buffer, not the number of
+  // code units written even if finished
+  std::pair<result, char16_t *> ret =
+      lsx_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
+                                                              utf16_output);
+  if (ret.first.count != len) {
+    result scalar_res =
+        scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
+            buf + ret.first.count, len - ret.first.count, ret.second);
+    if (scalar_res.error) {
+      scalar_res.count += ret.first.count;
+      return scalar_res;
+    } else {
+      ret.second += scalar_res.count;
     }
   }
-  size_t ret = scalar(src, tail, dst);
-  if (ret == 0)
-    return 0;
-  return (size_t)(dst - beg) + ret;
+  ret.first.count =
+      ret.second -
+      utf16_output; // Set count to the number of 16-bit code units written
+  return ret.first;
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
-    const char *src, size_t len, char *dst) const noexcept {
-  const char *beg = dst;
-  uint8_t last = 0;
-  for (size_t vl, vlOut; len > 0;
-       len -= vl, src += vl, dst += vlOut, last = src[-1]) {
-    vl = __riscv_vsetvl_e8m2(len);
-    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
-    // check which bytes are ASCII
-    vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl);
-    // count ASCII bytes
-    vlOut = __riscv_vcpop_m_b4(ascii, vl);
-    // The original code would only enter the next block after this check:
-    //   vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
-    //   vlOut = __riscv_vcpop_m_b4(m, vl);
-    //   if (vlOut != vl || last > 0b01111111) {...}q
-    // So that everything is ASCII or continuation bytes, we just proceeded
-    // without any processing, going straight to __riscv_vse8_v_u8m2.
-    // But you need the __riscv_vslide1up_vx_u8m2 whenever there is a non-ASCII
-    // byte.
-    if (vlOut != vl) { // If not pure ASCII
-      // Non-ASCII characters
-      // We now want to mark the ascii and continuation bytes
-      vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
-      // We count them, that's our new vlOut (output vector length)
-      vlOut = __riscv_vcpop_m_b4(m, vl);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return convert_utf32_to_utf16le(buf, len, utf16_output);
+}
 
-      vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl);
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be(
+    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
+  return convert_utf32_to_utf16be(buf, len, utf16_output);
+}
 
-      vbool4_t leading0 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b10111111, vl);
-      vbool4_t trailing1 = __riscv_vmslt_vx_i8m2_b4(
-          __riscv_vreinterpret_v_u8m2_i8m2(v1), (uint8_t)0b11000000, vl);
-      // -62 i 0b11000010, so we check whether any of v0 is too big
-      vbool4_t tobig = __riscv_vmand_mm_b4(
-          leading0,
-          __riscv_vmsgtu_vx_u8m2_b4(__riscv_vxor_vx_u8m2(v0, (uint8_t)-62, vl),
-                                    1, vl),
-          vl);
-      if (__riscv_vfirst_m_b4(
-              __riscv_vmor_mm_b4(
-                  tobig, __riscv_vmxor_mm_b4(leading0, trailing1, vl), vl),
-              vl) >= 0)
-        return 0;
+simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return convert_utf16le_to_utf32(buf, len, utf32_output);
+}
 
-      v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl),
-                                  v1, v1, 0b01000000, vl);
-      v1 = __riscv_vcompress_vm_u8m2(v1, m, vl);
-    } else if (last >= 0b11000000) { // If last byte is a leading  byte and we
-                                     // got only ASCII, error!
-      return 0;
-    }
-    __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut);
-  }
-  if (last > 0b10111111)
-    return 0;
-  return dst - beg;
+simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32(
+    const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
+  return convert_utf16be_to_utf32(buf, len, utf32_output);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
-    const char *src, size_t len, char *dst) const noexcept {
-  size_t res = convert_utf8_to_latin1(src, len, dst);
-  if (res)
-    return result(error_code::SUCCESS, res);
-  return scalar::utf8_to_latin1::convert_with_errors(src, len, dst);
+void implementation::change_endianness_utf16(const char16_t *input,
+                                             size_t length,
+                                             char16_t *output) const noexcept {
+  utf16::change_endianness_utf16(input, length, output);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
-    const char *src, size_t len, char *dst) const noexcept {
-  const char *beg = dst;
-  uint8_t last = 0;
-  for (size_t vl, vlOut; len > 0;
-       len -= vl, src += vl, dst += vlOut, last = src[-1]) {
-    vl = __riscv_vsetvl_e8m2(len);
-    vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl);
-    vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl);
-    vlOut = __riscv_vcpop_m_b4(ascii, vl);
-    if (vlOut != vl) { // If not pure ASCII
-      vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
-      vlOut = __riscv_vcpop_m_b4(m, vl);
-      vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl);
-      v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl),
-                                  v1, v1, 0b01000000, vl);
-      v1 = __riscv_vcompress_vm_u8m2(v1, m, vl);
-    }
-    __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut);
-  }
-  return dst - beg;
+simdutf_warn_unused size_t implementation::count_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::count_code_points<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
-    const char *src, size_t len, char16_t *dst) const noexcept {
-  return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::NONE>(src, len,
-                                                              (uint16_t *)dst);
+simdutf_warn_unused size_t implementation::count_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::count_code_points<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be(
-    const char *src, size_t len, char16_t *dst) const noexcept {
-  if (supports_zvbb())
-    return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::ZVBB>(
-        src, len, (uint16_t *)dst);
-  else
-    return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::V>(src, len,
-                                                             (uint16_t *)dst);
+simdutf_warn_unused size_t
+implementation::count_utf8(const char *input, size_t length) const noexcept {
+  return utf8::count_code_points(input, length);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors(
-    const char *src, size_t len, char16_t *dst) const noexcept {
-  size_t res = convert_utf8_to_utf16le(src, len, dst);
-  if (res)
-    return result(error_code::SUCCESS, res);
-  return scalar::utf8_to_utf16::convert_with_errors<endianness::LITTLE>(
-      src, len, dst);
+simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
+    const char *buf, size_t len) const noexcept {
+  return count_utf8(buf, len);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors(
-    const char *src, size_t len, char16_t *dst) const noexcept {
-  size_t res = convert_utf8_to_utf16be(src, len, dst);
-  if (res)
-    return result(error_code::SUCCESS, res);
-  return scalar::utf8_to_utf16::convert_with_errors<endianness::BIG>(src, len,
-                                                                     dst);
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf16(size_t length) const noexcept {
+  return length;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le(
-    const char *src, size_t len, char16_t *dst) const noexcept {
-  return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::NONE, false>(
-      src, len, (uint16_t *)dst);
+simdutf_warn_unused size_t
+implementation::latin1_length_from_utf32(size_t length) const noexcept {
+  return length;
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be(
-    const char *src, size_t len, char16_t *dst) const noexcept {
-  if (supports_zvbb())
-    return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::ZVBB, false>(
-        src, len, (uint16_t *)dst);
-  else
-    return rvv_utf8_to_common<uint16_t, simdutf_ByteFlip::V, false>(
-        src, len, (uint16_t *)dst);
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+    const char *input, size_t length) const noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
+  const uint8_t *data_end = data + length;
+  uint64_t result = 0;
+  // Test the bytes left, so no pointer beyond data_end is ever computed.
+  while (data_end - data >= 16) {
+    __m128i input_vec = __lsx_vld(data, 0);
+    const uint64_t two_bytes =
+        __lsx_vpickve2gr_hu(__lsx_vpcnt_h(__lsx_vmskltz_b(input_vec)), 0);
+    result += 16 + two_bytes; // 1 byte each, +1 for every byte >= 0x80
+    data += 16;
+  }
+  return result + scalar::latin1::utf8_length_from_latin1((const char *)data,
+                                                          data_end - data);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(
-    const char *src, size_t len, char32_t *dst) const noexcept {
-  return rvv_utf8_to_common<uint32_t, simdutf_ByteFlip::NONE>(src, len,
-                                                              (uint32_t *)dst);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(
-    const char *src, size_t len, char32_t *dst) const noexcept {
-  size_t res = convert_utf8_to_utf32(src, len, dst);
-  if (res)
-    return result(error_code::SUCCESS, res);
-  return scalar::utf8_to_utf32::convert_with_errors(src, len, dst);
+simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf8_length_from_utf16<endianness::BIG>(input, length);
 }
 
-simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
-    const char *src, size_t len, char32_t *dst) const noexcept {
-  return rvv_utf8_to_common<uint32_t, simdutf_ByteFlip::NONE, false>(
-      src, len, (uint32_t *)dst);
+simdutf_warn_unused size_t
+implementation::utf16_length_from_latin1(size_t length) const noexcept {
+  return length;
 }
-/* end file src/rvv/rvv_utf8_to.inl.cpp */
 
-simdutf_warn_unused int
-implementation::detect_encodings(const char *input,
-                                 size_t length) const noexcept {
-  // If there is a BOM, then we trust it.
-  auto bom_encoding = simdutf::BOM::check_bom(input, length);
-  if (bom_encoding != encoding_type::unspecified)
-    return bom_encoding;
-  // todo: reimplement as a one-pass algorithm.
-  int out = 0;
-  if (validate_utf8(input, length))
-    out |= encoding_type::UTF8;
-  if (length % 2 == 0) {
-    if (validate_utf16(reinterpret_cast<const char16_t *>(input), length / 2))
-      out |= encoding_type::UTF16_LE;
-  }
-  if (length % 4 == 0) {
-    if (validate_utf32(reinterpret_cast<const char32_t *>(input), length / 4))
-      out |= encoding_type::UTF32_LE;
-  }
+simdutf_warn_unused size_t
+implementation::utf32_length_from_latin1(size_t length) const noexcept {
+  return length;
+}
 
-  return out;
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::LITTLE>(input, length);
 }
 
-template <simdutf_ByteFlip bflip>
-simdutf_really_inline static void
-rvv_change_endianness_utf16(const char16_t *src, size_t len, char16_t *dst) {
-  for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) {
-    vl = __riscv_vsetvl_e16m8(len);
-    vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl);
-    __riscv_vse16_v_u16m8((uint16_t *)dst, simdutf_byteflip<bflip>(v, vl), vl);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf16be(
+    const char16_t *input, size_t length) const noexcept {
+  return utf16::utf32_length_from_utf16<endianness::BIG>(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  return utf8::utf16_length_from_utf8(input, length);
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept {
+  const __m128i v_80 = __lsx_vrepli_w(0x80); /*0x00000080*/
+  const __m128i v_800 = __lsx_vldi(-3832);   /*0x00000800*/
+  const __m128i v_10000 = __lsx_vldi(-3583); /*0x00010000*/
+  size_t pos = 0;
+  size_t count = 0;
+  for (; pos + 4 <= length; pos += 4) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input + pos), 0);
+    const __m128i ascii_bytes_bytemask = __lsx_vslt_wu(in, v_80);
+    const __m128i one_two_bytes_bytemask = __lsx_vslt_wu(in, v_800);
+    const __m128i two_bytes_bytemask =
+        __lsx_vxor_v(one_two_bytes_bytemask, ascii_bytes_bytemask);
+    const __m128i three_bytes_bytemask =
+        __lsx_vxor_v(__lsx_vslt_wu(in, v_10000), one_two_bytes_bytemask);
+    // Unsigned compares (as in utf16_length_from_utf32): no lane reads as < 0.
+    const uint32_t ascii_bytes_count = __lsx_vpickve2gr_bu(
+        __lsx_vpcnt_b(__lsx_vmskltz_w(ascii_bytes_bytemask)), 0);
+    const uint32_t two_bytes_count = __lsx_vpickve2gr_bu(
+        __lsx_vpcnt_b(__lsx_vmskltz_w(two_bytes_bytemask)), 0);
+    const uint32_t three_bytes_count = __lsx_vpickve2gr_bu(
+        __lsx_vpcnt_b(__lsx_vmskltz_w(three_bytes_bytemask)), 0);
+    // Start from 4 bytes per lane (16) and subtract what shorter forms save.
+    count +=
+        16 - 3 * ascii_bytes_count - 2 * two_bytes_count - three_bytes_count;
+  }
+  return count +
+         scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
+}
+
+simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
+    const char32_t *input, size_t length) const noexcept {
+  const __m128i v_ffff = __lsx_vldi(-2304); /*0x0000ffff*/
+  size_t pos = 0;
+  size_t count = 0;
+  for (; pos + 4 <= length; pos += 4) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input + pos), 0);
+    const __m128i surrogate_bytemask = __lsx_vslt_wu(v_ffff, in);
+    size_t surrogate_count = __lsx_vpickve2gr_bu(
+        __lsx_vpcnt_b(__lsx_vmskltz_w(surrogate_bytemask)), 0);
+    count += 4 + surrogate_count;
   }
+  return count +
+         scalar::utf32::utf16_length_from_utf32(input + pos, length - pos);
 }
 
-void implementation::change_endianness_utf16(const char16_t *src, size_t len,
-                                             char16_t *dst) const noexcept {
-  if (supports_zvbb())
-    return rvv_change_endianness_utf16<simdutf_ByteFlip::ZVBB>(src, len, dst);
-  else
-    return rvv_change_endianness_utf16<simdutf_ByteFlip::V>(src, len, dst);
+simdutf_warn_unused size_t implementation::utf32_length_from_utf8(
+    const char *input, size_t length) const noexcept {
+  return utf8::count_code_points(input, length);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
@@ -35002,86 +49864,21 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
 simdutf_warn_unused result implementation::base64_to_binary(
     const char *input, size_t length, char *output, base64_options options,
     last_chunk_handling_options last_chunk_options) const noexcept {
-  while (length > 0 &&
-         scalar::base64::is_ascii_white_space(input[length - 1])) {
-    length--;
-  }
-  size_t equallocation =
-      length; // location of the first padding character if any
-  size_t equalsigns = 0;
-  if (length > 0 && input[length - 1] == '=') {
-    equallocation = length - 1;
-    length -= 1;
-    equalsigns++;
-    while (length > 0 &&
-           scalar::base64::is_ascii_white_space(input[length - 1])) {
-      length--;
-    }
-    if (length > 0 && input[length - 1] == '=') {
-      equallocation = length - 1;
-      equalsigns++;
-      length -= 1;
-    }
-  }
-  if (length == 0) {
-    if (equalsigns > 0) {
-      return {INVALID_BASE64_CHARACTER, equallocation};
-    }
-    return {SUCCESS, 0};
-  }
-  result r = scalar::base64::base64_tail_decode(
-      output, input, length, equalsigns, options, last_chunk_options);
-  if (last_chunk_options != stop_before_partial &&
-      r.error == error_code::SUCCESS && equalsigns > 0) {
-    // additional checks
-    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, equallocation};
-    }
-  }
-  return r;
+  return (options & base64_url)
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options,
+                                             last_chunk_options);
 }
 
 simdutf_warn_unused full_result implementation::base64_to_binary_details(
     const char *input, size_t length, char *output, base64_options options,
     last_chunk_handling_options last_chunk_options) const noexcept {
-  while (length > 0 &&
-         scalar::base64::is_ascii_white_space(input[length - 1])) {
-    length--;
-  }
-  size_t equallocation =
-      length; // location of the first padding character if any
-  size_t equalsigns = 0;
-  if (length > 0 && input[length - 1] == '=') {
-    equallocation = length - 1;
-    length -= 1;
-    equalsigns++;
-    while (length > 0 &&
-           scalar::base64::is_ascii_white_space(input[length - 1])) {
-      length--;
-    }
-    if (length > 0 && input[length - 1] == '=') {
-      equallocation = length - 1;
-      equalsigns++;
-      length -= 1;
-    }
-  }
-  if (length == 0) {
-    if (equalsigns > 0) {
-      return {INVALID_BASE64_CHARACTER, equallocation, 0};
-    }
-    return {SUCCESS, 0, 0};
-  }
-  full_result r = scalar::base64::base64_tail_decode(
-      output, input, length, equalsigns, options, last_chunk_options);
-  if (last_chunk_options != stop_before_partial &&
-      r.error == error_code::SUCCESS && equalsigns > 0) {
-    // additional checks
-    if ((r.output_count % 3 == 0) ||
-        ((r.output_count % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
-    }
-  }
-  return r;
+  return (options & base64_url)
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options,
+                                             last_chunk_options);
 }
 
 simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
@@ -35092,86 +49889,21 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64(
 simdutf_warn_unused result implementation::base64_to_binary(
     const char16_t *input, size_t length, char *output, base64_options options,
     last_chunk_handling_options last_chunk_options) const noexcept {
-  while (length > 0 &&
-         scalar::base64::is_ascii_white_space(input[length - 1])) {
-    length--;
-  }
-  size_t equallocation =
-      length; // location of the first padding character if any
-  auto equalsigns = 0;
-  if (length > 0 && input[length - 1] == '=') {
-    equallocation = length - 1;
-    length -= 1;
-    equalsigns++;
-    while (length > 0 &&
-           scalar::base64::is_ascii_white_space(input[length - 1])) {
-      length--;
-    }
-    if (length > 0 && input[length - 1] == '=') {
-      equallocation = length - 1;
-      equalsigns++;
-      length -= 1;
-    }
-  }
-  if (length == 0) {
-    if (equalsigns > 0) {
-      return {INVALID_BASE64_CHARACTER, equallocation};
-    }
-    return {SUCCESS, 0};
-  }
-  result r = scalar::base64::base64_tail_decode(
-      output, input, length, equalsigns, options, last_chunk_options);
-  if (last_chunk_options != stop_before_partial &&
-      r.error == error_code::SUCCESS && equalsigns > 0) {
-    // additional checks
-    if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, equallocation};
-    }
-  }
-  return r;
+  return (options & base64_url)
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options,
+                                             last_chunk_options);
 }
 
 simdutf_warn_unused full_result implementation::base64_to_binary_details(
     const char16_t *input, size_t length, char *output, base64_options options,
     last_chunk_handling_options last_chunk_options) const noexcept {
-  while (length > 0 &&
-         scalar::base64::is_ascii_white_space(input[length - 1])) {
-    length--;
-  }
-  size_t equallocation =
-      length; // location of the first padding character if any
-  size_t equalsigns = 0;
-  if (length > 0 && input[length - 1] == '=') {
-    equallocation = length - 1;
-    length -= 1;
-    equalsigns++;
-    while (length > 0 &&
-           scalar::base64::is_ascii_white_space(input[length - 1])) {
-      length--;
-    }
-    if (length > 0 && input[length - 1] == '=') {
-      equallocation = length - 1;
-      equalsigns++;
-      length -= 1;
-    }
-  }
-  if (length == 0) {
-    if (equalsigns > 0) {
-      return {INVALID_BASE64_CHARACTER, equallocation, 0};
-    }
-    return {SUCCESS, 0, 0};
-  }
-  full_result r = scalar::base64::base64_tail_decode(
-      output, input, length, equalsigns, options, last_chunk_options);
-  if (last_chunk_options != stop_before_partial &&
-      r.error == error_code::SUCCESS && equalsigns > 0) {
-    // additional checks
-    if ((r.output_count % 3 == 0) ||
-        ((r.output_count % 3) + 1 + equalsigns != 4)) {
-      return {INVALID_BASE64_CHARACTER, equallocation, r.output_count};
-    }
-  }
-  return r;
+  return (options & base64_url)
+             ? compress_decode_base64<true>(output, input, length, options,
+                                            last_chunk_options)
+             : compress_decode_base64<false>(output, input, length, options,
+                                             last_chunk_options);
 }
 
 simdutf_warn_unused size_t implementation::base64_length_from_binary(
@@ -35182,148 +49914,148 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary(
 size_t implementation::binary_to_base64(const char *input, size_t length,
                                         char *output,
                                         base64_options options) const noexcept {
-  return scalar::base64::tail_encode_base64(output, input, length, options);
+  if (options & base64_url) {
+    return encode_base64<true>(output, input, length, options);
+  } else {
+    return encode_base64<false>(output, input, length, options);
+  }
 }
-} // namespace rvv
+} // namespace lsx
 } // namespace simdutf
 
-/* begin file src/simdutf/rvv/end.h */
-#if SIMDUTF_CAN_ALWAYS_RUN_RVV
-// nothing needed.
-#else
-SIMDUTF_UNTARGET_REGION
-#endif
-
-/* end file src/simdutf/rvv/end.h */
-/* end file src/rvv/implementation.cpp */
-#endif
-#if SIMDUTF_IMPLEMENTATION_WESTMERE
-/* begin file src/westmere/implementation.cpp */
-/* begin file src/simdutf/westmere/begin.h */
-// redefining SIMDUTF_IMPLEMENTATION to "westmere"
-// #define SIMDUTF_IMPLEMENTATION westmere
-
-#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
-// nothing needed.
-#else
-SIMDUTF_TARGET_WESTMERE
+/* begin file src/simdutf/lsx/end.h */
+/* end file src/simdutf/lsx/end.h */
+/* end file src/lsx/implementation.cpp */
 #endif
-/* end file src/simdutf/westmere/begin.h */
+#if SIMDUTF_IMPLEMENTATION_LASX
+/* begin file src/lasx/implementation.cpp */
+/* begin file src/simdutf/lasx/begin.h */
+// redefining SIMDUTF_IMPLEMENTATION to "lasx"
+// #define SIMDUTF_IMPLEMENTATION lasx
+/* end file src/simdutf/lasx/begin.h */
 namespace simdutf {
-namespace westmere {
+namespace lasx {
 namespace {
-#ifndef SIMDUTF_WESTMERE_H
-  #error "westmere.h must be included"
+#ifndef SIMDUTF_LASX_H
+  #error "lasx.h must be included"
 #endif
 using namespace simd;
 
+// convert vmskltz/vmskgez/vmsknz to
+// simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes index
+const uint8_t lasx_1_2_utf8_bytes_mask[] = {
+    0,   1,   4,   5,   16,  17,  20,  21,  64,  65,  68,  69,  80,  81,  84,
+    85,  2,   3,   6,   7,   18,  19,  22,  23,  66,  67,  70,  71,  82,  83,
+    86,  87,  8,   9,   12,  13,  24,  25,  28,  29,  72,  73,  76,  77,  88,
+    89,  92,  93,  10,  11,  14,  15,  26,  27,  30,  31,  74,  75,  78,  79,
+    90,  91,  94,  95,  32,  33,  36,  37,  48,  49,  52,  53,  96,  97,  100,
+    101, 112, 113, 116, 117, 34,  35,  38,  39,  50,  51,  54,  55,  98,  99,
+    102, 103, 114, 115, 118, 119, 40,  41,  44,  45,  56,  57,  60,  61,  104,
+    105, 108, 109, 120, 121, 124, 125, 42,  43,  46,  47,  58,  59,  62,  63,
+    106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148,
+    149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147,
+    150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152,
+    153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143,
+    154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164,
+    165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163,
+    166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168,
+    169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253,
+    170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254,
+    255};
+
+simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) {
+  return __lsx_vshuf4i_b(vec, 0b10110001);
+}
+simdutf_really_inline __m256i lasx_swap_bytes(__m256i vec) {
+  return __lasx_xvshuf4i_b(vec, 0b10110001);
+}
+
 simdutf_really_inline bool is_ascii(const simd8x64<uint8_t> &input) {
-  return input.reduce_or().is_ascii();
+  return input.is_ascii();
 }
 
 simdutf_unused simdutf_really_inline simd8<bool>
 must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2,
                      const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_second_byte =
-      prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0
-  simd8<uint8_t> is_third_byte =
-      prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0
-  simd8<uint8_t> is_fourth_byte =
-      prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0
-  // Caller requires a bool (all 1's). All values resulting from the subtraction
-  // will be <= 64, so signed comparison is fine.
-  return simd8<int8_t>(is_second_byte | is_third_byte | is_fourth_byte) >
-         int8_t(0);
+  simd8<bool> is_second_byte = prev1 >= uint8_t(0b11000000u);
+  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
+  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
+  // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller
+  // is using ^ as well. This will work fine because we only have to report
+  // errors for cases with 0-1 lead bytes. Multiple lead bytes implies 2
+  // overlapping multibyte characters, and if that happens, there is guaranteed
+  // to be at least *one* lead byte that is part of only 1 other multibyte
+  // character. The error will be detected there.
+  return is_second_byte ^ is_third_byte ^ is_fourth_byte;
 }
 
 simdutf_really_inline simd8<bool>
 must_be_2_3_continuation(const simd8<uint8_t> prev2,
                          const simd8<uint8_t> prev3) {
-  simd8<uint8_t> is_third_byte =
-      prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80
-  simd8<uint8_t> is_fourth_byte =
-      prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80
-  return simd8<bool>(is_third_byte | is_fourth_byte);
+  simd8<bool> is_third_byte = prev2 >= uint8_t(0b11100000u);
+  simd8<bool> is_fourth_byte = prev3 >= uint8_t(0b11110000u);
+  return is_third_byte ^ is_fourth_byte;
 }
 
-/* begin file src/westmere/internal/loader.cpp */
-namespace internal {
-namespace westmere {
-
-/* begin file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */
-/*
- * reads a vector of uint16 values
- * bits after 11th are ignored
- * first 11 bits are encoded into utf8
- * !important! utf8_output must have at least 16 writable bytes
- */
-
-inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output,
-                                       const __m128i one_byte_bytemask,
-                                       const uint16_t one_byte_bitmask) {
-  // 0b1100_0000_1000_0000
-  const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080);
-  // 0b0001_1111_0000_0000
-  const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
-  // 0b0000_0000_0011_1111
-  const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
-
-  // 1. prepare 2-byte values
-  // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-  // expected output   : [110a|aaaa|10bb|bbbb] x 8
-
-  // t0 = [000a|aaaa|bbbb|bb00]
-  const __m128i t0 = _mm_slli_epi16(v_u16, 2);
-  // t1 = [000a|aaaa|0000|0000]
-  const __m128i t1 = _mm_and_si128(t0, v_1f00);
-  // t2 = [0000|0000|00bb|bbbb]
-  const __m128i t2 = _mm_and_si128(v_u16, v_003f);
-  // t3 = [000a|aaaa|00bb|bbbb]
-  const __m128i t3 = _mm_or_si128(t1, t2);
-  // t4 = [110a|aaaa|10bb|bbbb]
-  const __m128i t4 = _mm_or_si128(t3, v_c080);
-
-  // 2. merge ASCII and 2-byte codewords
-  const __m128i utf8_unpacked = _mm_blendv_epi8(t4, v_u16, one_byte_bytemask);
-
-  // 3. prepare bitmask for 8-bit lookup
-  //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a
-  //    - LSB)
-  const uint16_t m0 = one_byte_bitmask & 0x5555;      // m0 = 0h0g0f0e0d0c0b0a
-  const uint16_t m1 = static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
-  const uint8_t m2 = static_cast<uint8_t>((m0 | m1) & 0xff); // m2 = hdgcfbea
-  // 4. pack the bytes
-  const uint8_t *row =
-      &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-  const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
-  const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-  // 5. store bytes
-  _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+// common functions for utf8 conversions
+simdutf_really_inline __m128i convert_utf8_3_byte_to_utf16(__m128i in) {
+  // Low half contains  10bbbbbb|10cccccc
+  // High half contains 1110aaaa|1110aaaa
+  const v16u8 sh = {2, 1, 5, 4, 8, 7, 11, 10, 0, 0, 3, 3, 6, 6, 9, 9};
+  const v8u16 v0fff = {0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff};
+
+  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, (__m128i)sh);
+  // 1110aaaa => aaaa0000
+  __m128i perm_high = __lsx_vslli_b(__lsx_vbsrl_v(perm, 8), 4);
+  // 10bbbbbb 10cccccc => 0010bbbb bbcccccc
+  __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), /* perm >> 2*/
+                                     perm, __lsx_vrepli_h(0x3f) /* 0x003f */);
+  // 0010bbbb bbcccccc => aaaabbbb bbcccccc
+  composed = __lsx_vbitsel_v(perm_high, composed, (__m128i)v0fff);
 
-  // 6. adjust pointers
-  utf8_output += row[0];
+  return composed;
 }
 
-inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output,
-                                       const __m128i v_0000,
-                                       const __m128i v_ff80) {
-  // no bits set above 7th bit
-  const __m128i one_byte_bytemask =
-      _mm_cmpeq_epi16(_mm_and_si128(v_u16, v_ff80), v_0000);
-  const uint16_t one_byte_bitmask =
-      static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
-
-  write_v_u16_11bits_to_utf8(v_u16, utf8_output, one_byte_bytemask,
-                             one_byte_bitmask);
+simdutf_really_inline __m128i convert_utf8_2_byte_to_utf16(__m128i in) {
+  // 10bbbbbb 110aaaaa => 00bbbbbb 000aaaaa
+  __m128i composed = __lsx_vand_v(in, __lsx_vldi(0x3f));
+  // 00bbbbbb 000aaaaa => 00000aaa aabbbbbb
+  composed = __lsx_vbitsel_v(
+      __lsx_vsrli_h(__lsx_vslli_h(composed, 8), 2), /* (aaaaa << 8) >> 2 */
+      __lsx_vsrli_h(composed, 8),                   /* bbbbbb >> 8 */
+      __lsx_vrepli_h(0x3f));                        /* 0x003f */
+  return composed;
 }
-/* end file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */
 
-} // namespace westmere
-} // namespace internal
-/* end file src/westmere/internal/loader.cpp */
+simdutf_really_inline __m128i
+convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) {
+  // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters.
+  // This is a relatively easy scenario:
+  // we process SIX (6) input code points. The max length in bytes of six
+  // code points spanning between 1 and 2 bytes each is 12 bytes.
+  __m128i sh =
+      __lsx_vld(reinterpret_cast<const uint8_t *>(
+                    simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]),
+                0);
+  // Shuffle
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 110aaaaa 10bbbbbb
+  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 00000000 00bbbbbb
+  __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits
+  // 1 byte: 00000000 00000000
+  // 2 byte: 00000aaa aa000000
+  __m128i v1f00 = __lsx_vldi(-2785); // -2785(13bit) => 151f
+  __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits
+  // Combine with an add; the right shift by 2 was already applied above
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 00000aaa aabbbbbb
+  composed = __lsx_vadd_h(ascii, composed);
+  return composed;
+}
 
-/* begin file src/westmere/sse_validate_utf16.cpp */
+/* begin file src/lasx/lasx_validate_utf16.cpp */
 /*
     In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning.
 
@@ -35344,7 +50076,7 @@ inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output,
     - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff)
     - there must not be sole low surrogate nor high surrogate
 
-    We are going to build three bitmasks based on the 3rd nibble:
+    We're going to build three bitmasks based on the 3rd nibble:
     - V = valid word,
     - L = low surrogate (0xd800 .. 0xdbff)
     - H = high surrogate (0xdc00 .. 0xdfff)
@@ -35371,7 +50103,7 @@ inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output,
    - nullptr if an error was detected.
 */
 template <endianness big_endian>
-const char16_t *sse_validate_utf16(const char16_t *input, size_t size) {
+const char16_t *lasx_validate_utf16(const char16_t *input, size_t size) {
   const char16_t *end = input + size;
 
   const auto v_d8 = simd8<uint8_t>::splat(0xd8);
@@ -35379,29 +50111,26 @@ const char16_t *sse_validate_utf16(const char16_t *input, size_t size) {
   const auto v_fc = simd8<uint8_t>::splat(0xfc);
   const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
-  while (input + simd16<uint16_t>::SIZE * 2 < end) {
+  while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
     // 0. Load data: since the validation takes into account only higher
     //    byte of each word, we compress the two vectors into one which
     //    consists only the higher bytes.
     auto in0 = simd16<uint16_t>(input);
-    auto in1 =
-        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+    auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
+
     if (big_endian) {
       in0 = in0.swap_bytes();
       in1 = in1.swap_bytes();
     }
 
-    const auto t0 = in0.shr<8>();
-    const auto t1 = in1.shr<8>();
-
-    const auto in = simd16<uint16_t>::pack(t0, t1);
+    const auto in = simd8<uint8_t>(__lasx_xvpermi_d(
+        __lasx_xvssrlni_bu_h(in1.value, in0.value, 8), 0b11011000));
 
     // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
     const auto surrogates_wordmask = (in & v_f8) == v_d8;
-    const uint16_t surrogates_bitmask =
-        static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
-    if (surrogates_bitmask == 0x0000) {
-      input += 16;
+    const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
+    if (surrogates_bitmask == 0x0) {
+      input += simd16<uint16_t>::ELEMENTS * 2;
     } else {
       // 2. We have some surrogates that have to be distinguished:
       //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
@@ -35411,36 +50140,35 @@ const char16_t *sse_validate_utf16(const char16_t *input, size_t size) {
 
       // V - non-surrogate code units
       //     V = not surrogates_wordmask
-      const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
+      const uint32_t V = ~surrogates_bitmask;
 
       // H - word-mask for high surrogates: the six highest bits are 0b1101'11
       const auto vH = (in & v_fc) == v_dc;
-      const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
+      const uint32_t H = vH.to_bitmask();
 
       // L - word mask for low surrogates
       //     L = not H and surrogates_wordmask
-      const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
+      const uint32_t L = ~H & surrogates_bitmask;
 
-      const uint16_t a = static_cast<uint16_t>(
-          L & (H >> 1)); // A low surrogate must be followed by high one.
-                         // (A low surrogate placed in the 7th register's word
-                         // is an exception we handle.)
-      const uint16_t b = static_cast<uint16_t>(
-          a << 1); // Just mark that the opinput - startite fact is hold,
-                   // thanks to that we have only two masks for valid case.
-      const uint16_t c = static_cast<uint16_t>(
-          V | a | b); // Combine all the masks into the final one.
+      const uint32_t a =
+          L & (H >> 1); // A low surrogate must be followed by high one.
+                        // (A low surrogate placed in the register's last word
+                        // is an exception we handle.)
+      const uint32_t b =
+          a << 1; // Just mark that the opposite fact holds,
+                  // thanks to that we have only two masks for valid case.
+      const uint32_t c = V | a | b; // Combine all the masks into the final one.
 
-      if (c == 0xffff) {
+      if (c == 0xffffffff) {
         // The whole input register contains valid UTF-16, i.e.,
         // either single code units or proper surrogate pairs.
-        input += 16;
-      } else if (c == 0x7fff) {
-        // The 15 lower code units of the input register contains valid UTF-16.
-        // The 15th word may be either a low or high surrogate. It the next
+        input += simd16<uint16_t>::ELEMENTS * 2;
+      } else if (c == 0x7fffffff) {
+        // The 31 lower code units of the input register contain valid UTF-16.
+        // The last word may be either a low or high surrogate. In the next
         // iteration we 1) check if the low surrogate is followed by a high
         // one, 2) reject sole high surrogate.
-        input += 15;
+        input += simd16<uint16_t>::ELEMENTS * 2 - 1;
       } else {
         return nullptr;
       }
@@ -35451,8 +50179,8 @@ const char16_t *sse_validate_utf16(const char16_t *input, size_t size) {
 }
 
 template <endianness big_endian>
-const result sse_validate_utf16_with_errors(const char16_t *input,
-                                            size_t size) {
+const result lasx_validate_utf16_with_errors(const char16_t *input,
+                                             size_t size) {
   if (simdutf_unlikely(size == 0)) {
     return result(error_code::SUCCESS, 0);
   }
@@ -35464,30 +50192,25 @@ const result sse_validate_utf16_with_errors(const char16_t *input,
   const auto v_fc = simd8<uint8_t>::splat(0xfc);
   const auto v_dc = simd8<uint8_t>::splat(0xdc);
 
-  while (input + simd16<uint16_t>::SIZE * 2 < end) {
+  while (input + simd16<uint16_t>::ELEMENTS * 2 < end) {
     // 0. Load data: since the validation takes into account only higher
     //    byte of each word, we compress the two vectors into one which
     //    consists only the higher bytes.
     auto in0 = simd16<uint16_t>(input);
-    auto in1 =
-        simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t));
+    auto in1 = simd16<uint16_t>(input + simd16<uint16_t>::ELEMENTS);
 
     if (big_endian) {
       in0 = in0.swap_bytes();
       in1 = in1.swap_bytes();
     }
-
-    const auto t0 = in0.shr<8>();
-    const auto t1 = in1.shr<8>();
-
-    const auto in = simd16<uint16_t>::pack(t0, t1);
+    const auto in = simd8<uint8_t>(__lasx_xvpermi_d(
+        __lasx_xvssrlni_bu_h(in1.value, in0.value, 8), 0b11011000));
 
     // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy).
     const auto surrogates_wordmask = (in & v_f8) == v_d8;
-    const uint16_t surrogates_bitmask =
-        static_cast<uint16_t>(surrogates_wordmask.to_bitmask());
-    if (surrogates_bitmask == 0x0000) {
-      input += 16;
+    const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask();
+    if (surrogates_bitmask == 0x0) {
+      input += simd16<uint16_t>::ELEMENTS * 2;
     } else {
       // 2. We have some surrogates that have to be distinguished:
       //    - low  surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF)
@@ -35497,36 +50220,35 @@ const result sse_validate_utf16_with_errors(const char16_t *input,
 
       // V - non-surrogate code units
       //     V = not surrogates_wordmask
-      const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask);
+      const uint32_t V = ~surrogates_bitmask;
 
       // H - word-mask for high surrogates: the six highest bits are 0b1101'11
       const auto vH = (in & v_fc) == v_dc;
-      const uint16_t H = static_cast<uint16_t>(vH.to_bitmask());
+      const uint32_t H = vH.to_bitmask();
 
       // L - word mask for low surrogates
       //     L = not H and surrogates_wordmask
-      const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask);
+      const uint32_t L = ~H & surrogates_bitmask;
 
-      const uint16_t a = static_cast<uint16_t>(
-          L & (H >> 1)); // A low surrogate must be followed by high one.
-                         // (A low surrogate placed in the 7th register's word
-                         // is an exception we handle.)
-      const uint16_t b = static_cast<uint16_t>(
-          a << 1); // Just mark that the opinput - startite fact is hold,
-                   // thanks to that we have only two masks for valid case.
-      const uint16_t c = static_cast<uint16_t>(
-          V | a | b); // Combine all the masks into the final one.
+      const uint32_t a =
+          L & (H >> 1); // A low surrogate must be followed by high one.
+                        // (A low surrogate placed in the register's last word
+                        // is an exception we handle.)
+      const uint32_t b =
+          a << 1; // Just mark that the opposite fact holds,
+                  // thanks to that we have only two masks for valid case.
+      const uint32_t c = V | a | b; // Combine all the masks into the final one.
 
-      if (c == 0xffff) {
+      if (c == 0xffffffff) {
         // The whole input register contains valid UTF-16, i.e.,
         // either single code units or proper surrogate pairs.
-        input += 16;
-      } else if (c == 0x7fff) {
-        // The 15 lower code units of the input register contains valid UTF-16.
-        // The 15th word may be either a low or high surrogate. It the next
+        input += simd16<uint16_t>::ELEMENTS * 2;
+      } else if (c == 0x7fffffff) {
+        // The 31 lower code units of the input register contain valid UTF-16.
+        // The last word may be either a low or high surrogate. In the next
         // iteration we 1) check if the low surrogate is followed by a high
         // one, 2) reject sole high surrogate.
-        input += 15;
+        input += simd16<uint16_t>::ELEMENTS * 2 - 1;
       } else {
         return result(error_code::SURROGATE, input - start);
       }
@@ -35535,200 +50257,289 @@ const result sse_validate_utf16_with_errors(const char16_t *input,
 
   return result(error_code::SUCCESS, input - start);
 }
-/* end file src/westmere/sse_validate_utf16.cpp */
-/* begin file src/westmere/sse_validate_utf32le.cpp */
-/* Returns:
-   - pointer to the last unprocessed character (a scalar fallback should check
-   the rest);
-   - nullptr if an error was detected.
-*/
-const char32_t *sse_validate_utf32le(const char32_t *input, size_t size) {
+/* end file src/lasx/lasx_validate_utf16.cpp */
+/* begin file src/lasx/lasx_validate_utf32le.cpp */
+
+const char32_t *lasx_validate_utf32le(const char32_t *input, size_t size) {
   const char32_t *end = input + size;
 
-  const __m128i standardmax = _mm_set1_epi32(0x10ffff);
-  const __m128i offset = _mm_set1_epi32(0xffff2000);
-  const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
-  __m128i currentmax = _mm_setzero_si128();
-  __m128i currentoffsetmax = _mm_setzero_si128();
+  // Performance degradation when memory address is not 32-byte aligned
+  while (((uint64_t)input & 0x1F) && input < end) {
+    uint32_t word = *input++;
+    if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) {
+      return nullptr;
+    }
+  }
 
-  while (input + 4 < end) {
-    const __m128i in = _mm_loadu_si128((__m128i *)input);
-    currentmax = _mm_max_epu32(in, currentmax);
+  __m256i offset = __lasx_xvreplgr2vr_w(uint32_t(0xffff2000));
+  __m256i standardoffsetmax = __lasx_xvreplgr2vr_w(uint32_t(0xfffff7ff));
+  __m256i standardmax = __lasx_xvldi(-2288); /*0x10ffff*/
+  __m256i currentmax = __lasx_xvldi(0x0);
+  __m256i currentoffsetmax = __lasx_xvldi(0x0);
+
+  while (input + 8 < end) {
+    __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(input), 0);
+    currentmax = __lasx_xvmax_wu(in, currentmax);
+    // 0xD8__ + 0x2000 = 0xF8__ => 0xF8__ > 0xF7FF
     currentoffsetmax =
-        _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
-    input += 4;
+        __lasx_xvmax_wu(__lasx_xvadd_w(in, offset), currentoffsetmax);
+    input += 8;
   }
-  __m128i is_zero =
-      _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
-  if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
+  __m256i is_zero =
+      __lasx_xvxor_v(__lasx_xvmax_wu(currentmax, standardmax), standardmax);
+  if (__lasx_xbnz_v(is_zero)) {
     return nullptr;
   }
 
-  is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax),
-                          standardoffsetmax);
-  if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
+  is_zero = __lasx_xvxor_v(__lasx_xvmax_wu(currentoffsetmax, standardoffsetmax),
+                           standardoffsetmax);
+  if (__lasx_xbnz_v(is_zero)) {
     return nullptr;
   }
-
   return input;
 }
 
-const result sse_validate_utf32le_with_errors(const char32_t *input,
-                                              size_t size) {
+const result lasx_validate_utf32le_with_errors(const char32_t *input,
+                                               size_t size) {
   const char32_t *start = input;
   const char32_t *end = input + size;
 
-  const __m128i standardmax = _mm_set1_epi32(0x10ffff);
-  const __m128i offset = _mm_set1_epi32(0xffff2000);
-  const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff);
-  __m128i currentmax = _mm_setzero_si128();
-  __m128i currentoffsetmax = _mm_setzero_si128();
+  // Performance degradation when memory address is not 32-byte aligned
+  while (((uint64_t)input & 0x1F) && input < end) {
+    uint32_t word = *input;
+    if (word > 0x10FFFF) {
+      return result(error_code::TOO_LARGE, input - start);
+    }
+    if (word >= 0xD800 && word <= 0xDFFF) {
+      return result(error_code::SURROGATE, input - start);
+    }
+    input++;
+  }
 
-  while (input + 4 < end) {
-    const __m128i in = _mm_loadu_si128((__m128i *)input);
-    currentmax = _mm_max_epu32(in, currentmax);
+  __m256i offset = __lasx_xvreplgr2vr_w(uint32_t(0xffff2000));
+  __m256i standardoffsetmax = __lasx_xvreplgr2vr_w(uint32_t(0xfffff7ff));
+  __m256i standardmax = __lasx_xvldi(-2288); /*0x10ffff*/
+  __m256i currentmax = __lasx_xvldi(0x0);
+  __m256i currentoffsetmax = __lasx_xvldi(0x0);
+
+  while (input + 8 < end) {
+    __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(input), 0);
+    currentmax = __lasx_xvmax_wu(in, currentmax);
     currentoffsetmax =
-        _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax);
+        __lasx_xvmax_wu(__lasx_xvadd_w(in, offset), currentoffsetmax);
 
-    __m128i is_zero =
-        _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax);
-    if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
+    __m256i is_zero =
+        __lasx_xvxor_v(__lasx_xvmax_wu(currentmax, standardmax), standardmax);
+    if (__lasx_xbnz_v(is_zero)) {
       return result(error_code::TOO_LARGE, input - start);
     }
-
-    is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax),
-                            standardoffsetmax);
-    if (_mm_test_all_zeros(is_zero, is_zero) == 0) {
+    is_zero =
+        __lasx_xvxor_v(__lasx_xvmax_wu(currentoffsetmax, standardoffsetmax),
+                       standardoffsetmax);
+    if (__lasx_xbnz_v(is_zero)) {
       return result(error_code::SURROGATE, input - start);
     }
-    input += 4;
+    input += 8;
   }
 
   return result(error_code::SUCCESS, input - start);
 }
-/* end file src/westmere/sse_validate_utf32le.cpp */
-
-/* begin file src/westmere/sse_convert_latin1_to_utf8.cpp */
-std::pair<const char *const, char *const>
-sse_convert_latin1_to_utf8(const char *latin_input,
-                           const size_t latin_input_length, char *utf8_output) {
-  const char *end = latin_input + latin_input_length;
-
-  const __m128i v_0000 = _mm_setzero_si128();
-  // 0b1000_0000
-  const __m128i v_80 = _mm_set1_epi8((uint8_t)0x80);
-  // 0b1111_1111_1000_0000
-  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
-
-  const __m128i latin_1_half_into_u16_byte_mask =
-      _mm_setr_epi8(0, '\x80', 1, '\x80', 2, '\x80', 3, '\x80', 4, '\x80', 5,
-                    '\x80', 6, '\x80', 7, '\x80');
+/* end file src/lasx/lasx_validate_utf32le.cpp */
 
-  const __m128i latin_2_half_into_u16_byte_mask =
-      _mm_setr_epi8(8, '\x80', 9, '\x80', 10, '\x80', 11, '\x80', 12, '\x80',
-                    13, '\x80', 14, '\x80', 15, '\x80');
+/* begin file src/lasx/lasx_convert_latin1_to_utf8.cpp */
+/*
+  Returns a pair: the first unprocessed input byte and the end of the output.
+  A scalar routine should carry on the conversion of the tail.
+*/
 
-  // each latin1 takes 1-2 utf8 bytes
-  // slow path writes useful 8-15 bytes twice (eagerly writes 16 bytes and then
-  // adjust the pointer) so the last write can exceed the utf8_output size by
-  // 8-1 bytes by reserving 8 extra input bytes, we expect the output to have
-  // 8-16 bytes free
-  while (end - latin_input >= 16 + 8) {
-    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
-    __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input);
+std::pair<const char *, char *>
+lasx_convert_latin1_to_utf8(const char *latin1_input, size_t len,
+                            char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
+  const size_t safety_margin = 12;
+  const char *end = latin1_input + len - safety_margin;
 
-    if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!!
-      _mm_storeu_si128((__m128i *)utf8_output, v_latin);
-      latin_input += 16;
+  // We always write 16 bytes, of which more than the first 8 bytes
+  // are valid. A margin of 8 would do; safety_margin (12) is more than enough.
+  while (latin1_input + 16 <= end) {
+    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(latin1_input), 0);
+    uint32_t ascii_mask = __lsx_vpickve2gr_wu(__lsx_vmskgez_b(in8), 0);
+    if (ascii_mask == 0xFFFF) {
+      __lsx_vst(in8, utf8_output, 0);
       utf8_output += 16;
+      latin1_input += 16;
       continue;
     }
+    // We just fallback on UTF-16 code. This could be optimized/simplified
+    // further.
+    __m256i in16 = __lasx_vext2xv_hu_bu(____m256i(in8));
+    // 1. prepare 2-byte values
+    // input 8-bit word : [aabb|bbbb] x 16
+    // expected output   : [1100|00aa|10bb|bbbb] x 16
+    // t0 = [0000|00aa|bbbb|bb00]
+    __m256i t0 = __lasx_xvslli_h(in16, 2);
+    // t1 = [0000|00aa|0000|0000]
+    __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785));
+    // t2 = [0000|00aa|00bb|bbbb]
+    __m256i t2 = __lasx_xvbitsel_v(t1, in16, __lasx_xvrepli_h(0x3f));
+    // t3 = [1100|00aa|10bb|bbbb]
+    __m256i t3 = __lasx_xvor_v(t2, __lasx_xvreplgr2vr_h(uint16_t(0xc080)));
+    // merge ASCII and 2-byte codewords
+    __m256i one_byte_bytemask = __lasx_xvsle_hu(in16, __lasx_xvrepli_h(0x7F));
+    __m256i utf8_unpacked = __lasx_xvbitsel_v(t3, in16, one_byte_bytemask);
+
+    const uint8_t *row0 =
+        &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+            [lasx_1_2_utf8_bytes_mask[(ascii_mask & 0xFF)]][0];
+    __m128i shuffle0 = __lsx_vld(row0 + 1, 0);
+    __m128i utf8_unpacked_lo = lasx_extracti128_lo(utf8_unpacked);
+    __m128i utf8_packed0 =
+        __lsx_vshuf_b(utf8_unpacked_lo, utf8_unpacked_lo, shuffle0);
+    __lsx_vst(utf8_packed0, utf8_output, 0);
+    utf8_output += row0[0];
+
+    const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                              [lasx_1_2_utf8_bytes_mask[(ascii_mask >> 8)]][0];
+    __m128i shuffle1 = __lsx_vld(row1 + 1, 0);
+    __m128i utf8_unpacked_hi = lasx_extracti128_hi(utf8_unpacked);
+    __m128i utf8_packed1 =
+        __lsx_vshuf_b(utf8_unpacked_hi, utf8_unpacked_hi, shuffle1);
+    __lsx_vst(utf8_packed1, utf8_output, 0);
+    utf8_output += row1[0];
 
-    // assuming a/b are bytes and A/B are uint16 of the same value
-    // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
-    __m128i v_u16_latin_1_half =
-        _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
-    // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB
-    __m128i v_u16_latin_2_half =
-        _mm_shuffle_epi8(v_latin, latin_2_half_into_u16_byte_mask);
+    latin1_input += 16;
+  } // while
 
-    internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half,
-                                                   utf8_output, v_0000, v_ff80);
-    internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_2_half,
-                                                   utf8_output, v_0000, v_ff80);
-    latin_input += 16;
+  return std::make_pair(latin1_input, reinterpret_cast<char *>(utf8_output));
+}
+/* end file src/lasx/lasx_convert_latin1_to_utf8.cpp */
+/* begin file src/lasx/lasx_convert_latin1_to_utf16.cpp */
+std::pair<const char *, char16_t *>
+lasx_convert_latin1_to_utf16le(const char *buf, size_t len,
+                               char16_t *utf16_output) {
+  const char *end = buf + len;
+
+  // Performance degradation when memory address is not 32-byte aligned
+  while (((uint64_t)utf16_output & 0x1F) && buf < end) {
+    *utf16_output++ = uint8_t(*buf) & 0xFF;
+    buf++;
   }
 
-  if (end - latin_input >= 16) {
-    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
-    __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input);
+  while (buf + 32 <= end) {
+    __m256i in8 = __lasx_xvld(reinterpret_cast<const uint8_t *>(buf), 0);
 
-    if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!!
-      _mm_storeu_si128((__m128i *)utf8_output, v_latin);
-      latin_input += 16;
-      utf8_output += 16;
-    } else {
-      // assuming a/b are bytes and A/B are uint16 of the same value
-      // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA
-      __m128i v_u16_latin_1_half =
-          _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask);
-      internal::westmere::write_v_u16_11bits_to_utf8(
-          v_u16_latin_1_half, utf8_output, v_0000, v_ff80);
-      latin_input += 8;
-    }
+    __m256i inlow = __lasx_vext2xv_hu_bu(in8);
+    __m256i in8_high = __lasx_xvpermi_q(in8, in8, 0b00000001);
+    __m256i inhigh = __lasx_vext2xv_hu_bu(in8_high);
+    __lasx_xvst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    __lasx_xvst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 32);
+
+    utf16_output += 32;
+    buf += 32;
   }
 
-  return std::make_pair(latin_input, utf8_output);
+  if (buf + 16 <= end) {
+    __m128i zero = __lsx_vldi(0);
+    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
+
+    __m128i inlow = __lsx_vilvl_b(zero, in8);
+    __m128i inhigh = __lsx_vilvh_b(zero, in8);
+    __lsx_vst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    __lsx_vst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 16);
+
+    utf16_output += 16;
+    buf += 16;
+  }
+  return std::make_pair(buf, utf16_output);
 }
-/* end file src/westmere/sse_convert_latin1_to_utf8.cpp */
-/* begin file src/westmere/sse_convert_latin1_to_utf16.cpp */
-template <endianness big_endian>
+
 std::pair<const char *, char16_t *>
-sse_convert_latin1_to_utf16(const char *latin1_input, size_t len,
-                            char16_t *utf16_output) {
-  size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
-  for (size_t i = 0; i < rounded_len; i += 16) {
-    // Load 16 Latin1 characters into a 128-bit register
-    __m128i in =
-        _mm_loadu_si128(reinterpret_cast<const __m128i *>(&latin1_input[i]));
-    __m128i out1 = big_endian ? _mm_unpacklo_epi8(_mm_setzero_si128(), in)
-                              : _mm_unpacklo_epi8(in, _mm_setzero_si128());
-    __m128i out2 = big_endian ? _mm_unpackhi_epi8(_mm_setzero_si128(), in)
-                              : _mm_unpackhi_epi8(in, _mm_setzero_si128());
-    // Zero extend each Latin1 character to 16-bit integers and store the
-    // results back to memory
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(&utf16_output[i]), out1);
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(&utf16_output[i + 8]), out2);
+lasx_convert_latin1_to_utf16be(const char *buf, size_t len,
+                               char16_t *utf16_output) {
+  const char *end = buf + len;
+
+  while (((uint64_t)utf16_output & 0x1F) && buf < end) {
+    *utf16_output++ = (uint16_t(*buf++) << 8);
   }
-  // return pointers pointing to where we left off
-  return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len);
+
+  __m256i zero = __lasx_xvldi(0);
+  while (buf + 32 <= end) {
+    __m256i in8 = __lasx_xvld(reinterpret_cast<const uint8_t *>(buf), 0);
+
+    __m256i in8_shuf = __lasx_xvpermi_d(in8, 0b11011000);
+
+    __m256i inlow = __lasx_xvilvl_b(in8_shuf, zero);
+    __m256i inhigh = __lasx_xvilvh_b(in8_shuf, zero);
+    __lasx_xvst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    __lasx_xvst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 32);
+    utf16_output += 32;
+    buf += 32;
+  }
+
+  if (buf + 16 <= end) {
+    __m128i zero_128 = __lsx_vldi(0);
+    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
+
+    __m128i inlow = __lsx_vilvl_b(in8, zero_128);
+    __m128i inhigh = __lsx_vilvh_b(in8, zero_128);
+    __lsx_vst(inlow, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    __lsx_vst(inhigh, reinterpret_cast<uint16_t *>(utf16_output), 16);
+    utf16_output += 16;
+    buf += 16;
+  }
+
+  return std::make_pair(buf, utf16_output);
 }
-/* end file src/westmere/sse_convert_latin1_to_utf16.cpp */
-/* begin file src/westmere/sse_convert_latin1_to_utf32.cpp */
+/* end file src/lasx/lasx_convert_latin1_to_utf16.cpp */
+/* begin file src/lasx/lasx_convert_latin1_to_utf32.cpp */
 std::pair<const char *, char32_t *>
-sse_convert_latin1_to_utf32(const char *buf, size_t len,
-                            char32_t *utf32_output) {
+lasx_convert_latin1_to_utf32(const char *buf, size_t len,
+                             char32_t *utf32_output) {
   const char *end = buf + len;
 
-  while (end - buf >= 16) {
-    // Load 16 Latin1 characters (16 bytes) into a 128-bit register
-    __m128i in = _mm_loadu_si128((__m128i *)buf);
+  // LASX requires 32-byte alignment, otherwise performance will be degraded
+  while (((uint64_t)utf32_output & 0x1F) && buf < end) {
+    *utf32_output++ = ((uint32_t)*buf) & 0xFF;
+    buf++;
+  }
 
-    // Shift input to process next 4 bytes
-    __m128i in_shifted1 = _mm_srli_si128(in, 4);
-    __m128i in_shifted2 = _mm_srli_si128(in, 8);
-    __m128i in_shifted3 = _mm_srli_si128(in, 12);
+  while (buf + 32 <= end) {
+    __m256i in8 = __lasx_xvld(reinterpret_cast<const uint8_t *>(buf), 0);
 
-    // expand 8-bit to 32-bit unit
-    __m128i out1 = _mm_cvtepu8_epi32(in);
-    __m128i out2 = _mm_cvtepu8_epi32(in_shifted1);
-    __m128i out3 = _mm_cvtepu8_epi32(in_shifted2);
-    __m128i out4 = _mm_cvtepu8_epi32(in_shifted3);
+    __m256i in32_0 = __lasx_vext2xv_wu_bu(in8);
+    __lasx_xvst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
 
-    _mm_storeu_si128((__m128i *)utf32_output, out1);
-    _mm_storeu_si128((__m128i *)(utf32_output + 4), out2);
-    _mm_storeu_si128((__m128i *)(utf32_output + 8), out3);
-    _mm_storeu_si128((__m128i *)(utf32_output + 12), out4);
+    __m256i in8_1 = __lasx_xvpermi_d(in8, 0b00000001);
+    __m256i in32_1 = __lasx_vext2xv_wu_bu(in8_1);
+    __lasx_xvst(in32_1, reinterpret_cast<uint32_t *>(utf32_output), 32);
+
+    __m256i in8_2 = __lasx_xvpermi_d(in8, 0b00000010);
+    __m256i in32_2 = __lasx_vext2xv_wu_bu(in8_2);
+    __lasx_xvst(in32_2, reinterpret_cast<uint32_t *>(utf32_output), 64);
+
+    __m256i in8_3 = __lasx_xvpermi_d(in8, 0b00000011);
+    __m256i in32_3 = __lasx_vext2xv_wu_bu(in8_3);
+    __lasx_xvst(in32_3, reinterpret_cast<uint32_t *>(utf32_output), 96);
+
+    utf32_output += 32;
+    buf += 32;
+  }
+
+  if (buf + 16 <= end) {
+    __m128i in8 = __lsx_vld(reinterpret_cast<const uint8_t *>(buf), 0);
+
+    __m128i zero = __lsx_vldi(0);
+    __m128i in16low = __lsx_vilvl_b(zero, in8);
+    __m128i in16high = __lsx_vilvh_b(zero, in8);
+    __m128i in32_0 = __lsx_vilvl_h(zero, in16low);
+    __m128i in32_1 = __lsx_vilvh_h(zero, in16low);
+    __m128i in32_2 = __lsx_vilvl_h(zero, in16high);
+    __m128i in32_3 = __lsx_vilvh_h(zero, in16high);
+
+    __lsx_vst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
+    __lsx_vst(in32_1, reinterpret_cast<uint32_t *>(utf32_output), 16);
+    __lsx_vst(in32_2, reinterpret_cast<uint32_t *>(utf32_output), 32);
+    __lsx_vst(in32_3, reinterpret_cast<uint32_t *>(utf32_output), 48);
 
     utf32_output += 16;
     buf += 16;
@@ -35736,15 +50547,13 @@ sse_convert_latin1_to_utf32(const char *buf, size_t len,
 
   return std::make_pair(buf, utf32_output);
 }
-/* end file src/westmere/sse_convert_latin1_to_utf32.cpp */
-
-/* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */
-// depends on "tables/utf8_to_utf16_tables.h"
+/* end file src/lasx/lasx_convert_latin1_to_utf32.cpp */
 
-// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the
+/* begin file src/lasx/lasx_convert_utf8_to_utf16.cpp */
+// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
-// It returns how many bytes were consumed (up to 12).
+// It returns how many bytes were consumed (up to 16, usually 12).
 template <endianness big_endian>
 size_t convert_masked_utf8_to_utf16(const char *input,
                                     uint64_t utf8_end_of_code_point_mask,
@@ -35753,204 +50562,304 @@ size_t convert_masked_utf8_to_utf16(const char *input,
   // Why 12 input bytes and not 16? Because we are concerned with the size of
   // the lookup tables. Also 12 is nicely divisible by two and three.
   //
+  __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff;
   //
   // Optimization note: our main path below is load-latency dependent. Thus it
   // is maybe beneficial to have fast paths that depend on branch prediction but
   // have less latency. This results in more instructions but, potentially, also
   // higher speeds.
-  //
+
   // We first try a few fast paths.
-  const __m128i swap =
-      _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-  const __m128i in = _mm_loadu_si128((__m128i *)input);
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  if (utf8_end_of_code_point_mask == 0xfff) {
-    // We process the data in chunks of 12 bytes.
-    // Note: using 16 bytes is unsafe, see issue_ossfuzz_71218
-    __m128i ascii_first = _mm_cvtepu8_epi16(in);
-    __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in, 8));
-    if (big_endian) {
-      ascii_first = _mm_shuffle_epi8(ascii_first, swap);
-      ascii_second = _mm_shuffle_epi8(ascii_second, swap);
+  // The obvious first test is ASCII, which actually consumes the full 16.
+  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
+    __m128i zero = __lsx_vldi(0);
+    if (match_system(big_endian)) {
+      __lsx_vst(__lsx_vilvl_b(zero, in),
+                reinterpret_cast<uint16_t *>(utf16_output), 0);
+      __lsx_vst(__lsx_vilvh_b(zero, in),
+                reinterpret_cast<uint16_t *>(utf16_output), 16);
+    } else {
+      __lsx_vst(__lsx_vilvl_b(in, zero),
+                reinterpret_cast<uint16_t *>(utf16_output), 0);
+      __lsx_vst(__lsx_vilvh_b(in, zero),
+                reinterpret_cast<uint16_t *>(utf16_output), 16);
     }
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first);
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8),
-                     ascii_second);
-    utf16_output += 12; // We wrote 12 16-bit characters.
-    return 12;          // We consumed 12 bytes.
+    utf16_output += 16; // We wrote 16 16-bit characters.
+    return 16;          // We consumed 16 bytes.
   }
-  if (((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) {
-    // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte
-    // UTF-16 code units. There is probably a more efficient sequence, but the
-    // following might do.
-    const __m128i sh =
-        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    if (big_endian)
-      composed = _mm_shuffle_epi8(composed, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed);
-    utf16_output += 8; // We wrote 16 bytes, 8 code points.
-    return 16;
+
+  // 3 byte sequences are the next most common, as seen in CJK, which has long
+  // sequences of these.
+  if (input_utf8_end_of_code_point_mask == 0x924) {
+    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
+    // UTF-16 code units.
+    __m128i composed = convert_utf8_3_byte_to_utf16(in);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed = lsx_swap_bytes(composed);
+    }
+
+    __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    utf16_output += 4; // We wrote 4 16-bit characters.
+    return 12;         // We consumed 12 bytes.
   }
-  if (input_utf8_end_of_code_point_mask == 0x924) {
-    // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte
-    // UTF-16 code units. There is probably a more efficient sequence, but the
-    // following might do.
-    const __m128i sh =
-        _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-    if (big_endian)
-      composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
-    utf16_output += 4;
-    return 12;
+
+  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xAAAA) {
+    // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte
+    // UTF-16 code units.
+    __m128i composed = convert_utf8_2_byte_to_utf16(in);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed = lsx_swap_bytes(composed);
+    }
+
+    __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    utf16_output += 8; // We wrote 8 16-bit characters.
+    return 16;         // We consumed 16 bytes.
   }
-  /// We do not have a fast path available, so we fallback.
 
-  const uint8_t idx =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+  /// We do not have a fast path available, or the fast path is unimportant, so
+  /// we fallback.
+  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][0];
+
+  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][1];
+  const __m128i zero = __lsx_vldi(0);
   if (idx < 64) {
     // SIX (6) input code-code units
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-code units. The max length in bytes of six
-    // code code units spanning between 1 and 2 bytes each is 12 bytes. On
-    // processors where pdep/pext is fast, we might be able to use a small
-    // lookup table.
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    if (big_endian)
-      composed = _mm_shuffle_epi8(composed, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed);
-    utf16_output += 6; // We wrote 12 bytes, 6 code points.
+    // Convert to UTF-16
+    __m128i composed = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      composed = lsx_swap_bytes(composed);
+    }
+    // Store
+    __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    utf16_output += 6; // We wrote 6 16-bit characters.
+    return consumed;
   } else if (idx < 145) {
     // FOUR (4) input code-code units
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    __m128i composed_repacked = _mm_packus_epi32(composed, composed);
-    if (big_endian)
-      composed_repacked = _mm_shuffle_epi8(composed_repacked, swap);
-    _mm_storeu_si128((__m128i *)utf16_output, composed_repacked);
-    utf16_output += 4;
+    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+                           0);
+    // XXX: depending on the system scalar instructions might be faster.
+    // 1 byte: 00000000 00000000 0ccccccc
+    // 2 byte: 00000000 110bbbbb 10cccccc
+    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+    __m128i perm = __lsx_vshuf_b(zero, in, sh);
+    // 1 byte: 00000000 0ccccccc
+    // 2 byte: xx0bbbbb x0cccccc
+    // 3 byte: xxbbbbbb x0cccccc
+    __m128i lowperm = __lsx_vpickev_h(perm, perm);
+    // 1 byte: 00000000 00000000
+    // 2 byte: 00000000 00000000
+    // 3 byte: 00000000 1110aaaa
+    __m128i highperm = __lsx_vpickod_h(perm, perm);
+    // 3 byte: aaaa0000 00000000
+    highperm = __lsx_vslli_h(highperm, 12);
+    // ASCII
+    // 1 byte: 00000000 0ccccccc
+    // 2+byte: 00000000 00cccccc
+    __m128i ascii = __lsx_vand_v(lowperm, __lsx_vrepli_h(0x7f));
+    // 1 byte: 00000000 00000000
+    // 2 byte: xx0bbbbb 00000000
+    // 3 byte: xxbbbbbb 00000000
+    __m128i middlebyte = __lsx_vand_v(lowperm, __lsx_vldi(-2561) /*0xFF00*/);
+    // 1 byte: 00000000 0ccccccc
+    // 2 byte: 0010bbbb bbcccccc
+    // 3 byte: 0010bbbb bbcccccc
+    __m128i composed = __lsx_vor_v(__lsx_vsrli_h(middlebyte, 2), ascii);
+
+    __m128i v0fff = __lsx_vreplgr2vr_h(uint16_t(0xfff));
+    // aaaabbbb bbcccccc
+    composed = __lsx_vbitsel_v(highperm, composed, v0fff);
+
+    if (!match_system(big_endian)) {
+      composed = lsx_swap_bytes(composed);
+    }
+
+    __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+    utf16_output += 4; // We wrote 4 16-bit codepoints
+    return consumed;
   } else if (idx < 209) {
-    // TWO (2) input code-code units
-    //////////////
-    // There might be garbage inputs where a leading byte mascarades as a
-    // four-byte leading byte (by being followed by 3 continuation byte), but is
-    // not greater than 0xf0. This could trigger a buffer overflow if we only
-    // counted leading bytes of the form 0xf0 as generating surrogate pairs,
-    // without further UTF-8 validation. Thus we must be careful to ensure that
-    // only leading bytes at least as large as 0xf0 generate surrogate pairs. We
-    // do as at the cost of an extra mask.
-    /////////////
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-    // correct for spurious high bit
-    const __m128i correct =
-        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-    // We deliberately carry the leading four bits in highbyte if they are
-    // present, we remove them later when computing hightenbits.
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000));
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-    // When we need to generate a surrogate pair (leading byte > 0xF0), then
-    // the corresponding 32-bit value in 'composed'  will be greater than
-    // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the
-    // location of the surrogate pairs.
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-    const __m128i composedminus =
-        _mm_sub_epi32(composed, _mm_set1_epi32(0x10000));
-    const __m128i lowtenbits =
-        _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff));
-    // Notice the 0x3ff mask:
-    const __m128i hightenbits =
-        _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff));
-    const __m128i lowtenbitsadd =
-        _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00));
-    const __m128i hightenbitsadd =
-        _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800));
-    const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16);
-    __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted);
-    uint32_t basic_buffer[4];
-    uint32_t basic_buffer_swap[4];
-    if (big_endian) {
-      _mm_storeu_si128((__m128i *)basic_buffer_swap,
-                       _mm_shuffle_epi8(composed, swap));
-      surrogates = _mm_shuffle_epi8(surrogates, swap);
+    // THREE (3) input code-code units
+    if (input_utf8_end_of_code_point_mask == 0x888) {
+      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
+      // UTF-16 pairs. Generating surrogate pairs is a little tricky though, but
+      // it is easier when we can assume they are all pairs. This version does
+      // not use the LUT, but 4 byte sequences are less common and the overhead
+      // of the extra memory access is less important than the early branch
+      // overhead in shorter sequences.
+
+      // Swap byte pairs
+      // 10dddddd 10cccccc|10bbbbbb 11110aaa
+      // 10cccccc 10dddddd|11110aaa 10bbbbbb
+      __m128i swap = lsx_swap_bytes(in);
+      // Shift left 2 bits
+      // cccccc00 dddddd00 xxxxxxxx bbbbbb00
+      __m128i shift = __lsx_vslli_b(swap, 2);
+      // Magic number: trail surrogate's low 2 bits plus all pair corrections.
+      // UTF-8 4b prefix   = -0x0000|0xF000
+      // surrogate offset  = -0x0000|0x0040 (0x10000 << 6)
+      // surrogate high    = +0x0000|0xD800
+      // surrogate low     = +0xDC00|0x0000
+      // -------------------------------
+      //                   = +0xDC00|0xE7C0
+      __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0));
+      // Generate unadjusted trail surrogate minus lowest 2 bits
+      // vec(0000FF00) = __lsx_vldi(-1758)
+      // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00
+      __m128i trail =
+          __lsx_vbitsel_v(shift, swap, __lsx_vldi(-1758 /*0000FF00*/));
+      // Insert low 2 bits of trail surrogate to magic number for later
+      // 11011100 00000000 11100111 110000cc
+      __m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic);
+
+      // Generate lead surrogate
+      // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx
+      // 000000cc ccdddddd|xxxxxxxx xxxxxxxx
+      __m128i lead = __lsx_vbitsel_v(
+          __lsx_vsrli_h(__lsx_vand_v(shift, __lsx_vldi(0x3F)), 4), swap,
+          __lsx_vrepli_h(0x3f /* 0x003f*/));
+
+      // Blend pairs
+      // __lsx_vldi(-1741) => vec(0x0000FFFF)
+      // 000000cc ccdddddd|11110aaa bbbbbb00
+      __m128i blend =
+          __lsx_vbitsel_v(lead, trail, __lsx_vldi(-1741) /* (0x0000FFFF)*4 */);
+
+      // Add magic number to finish the result
+      // 110111CC CCDDDDDD|110110AA BBBBBBCC
+      __m128i composed = __lsx_vadd_h(blend, magic_with_low_2);
+      // Byte swap if necessary
+      if (!match_system(big_endian)) {
+        composed = lsx_swap_bytes(composed);
+      }
+      __lsx_vst(composed, reinterpret_cast<uint16_t *>(utf16_output), 0);
+      utf16_output += 6; // We wrote 3 32-bit surrogate pairs.
+      return 12;         // We consumed 12 bytes.
     }
-    _mm_storeu_si128((__m128i *)basic_buffer, composed);
-    uint32_t surrogate_buffer[4];
-    _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates);
+    // 3 1-4 byte sequences
+    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+                           0);
+    // 1 byte: 00000000 00000000 00000000 0ddddddd
+    // 2 byte: 00000000 00000000 110ccccc 10dddddd
+    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+    __m128i perm = __lsx_vshuf_b(zero, in, sh);
+    // added to fix issue https://github.com/simdutf/simdutf/issues/514
+    // We only want to write 2 * 16-bit code units when that is actually what we
+    // have. Unfortunately, we cannot trust the input. So it is possible to get
+    // 0xff as an input byte and it should not result in a surrogate pair. We
+    // need to check for that.
+    uint32_t permbuffer[4];
+    __lsx_vst(perm, permbuffer, 0);
+    // Mask the low and middle bytes
+    // 00000000 00000000 00000000 0ddddddd
+    __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7f));
+    // Because the surrogates need more work, the high surrogate is computed
+    // first.
+    __m128i middlehigh = __lsx_vslli_w(perm, 2);
+    // 00000000 00000000 00cccccc 00000000
+    __m128i middlebyte = __lsx_vand_v(perm, __lsx_vldi(-3777) /* 0x00003F00 */);
+    // Start assembling the sequence. The 4th byte already sits where it would
+    // in a surrogate and there is no dependency, so shift left, not right.
+    // 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx
+    // 4 byte: 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx
+    __m128i ab =
+        __lsx_vbitsel_v(middlehigh, perm, __lsx_vldi(-1656) /*0xFF000000*/);
+    // Top 16 bits contain the surrogate pair's high ten bits, pre-correction.
+    // 3 byte: 00000000 10bbbbcc|cccc0000 00000000
+    // 4 byte: 11110aaa bbbbbbcc|cccc0000 00000000 - high 10 bits correct
+    __m128i v_fffc0000 = __lsx_vreplgr2vr_w(uint32_t(0xFFFC0000));
+    __m128i abc = __lsx_vbitsel_v(__lsx_vslli_w(middlebyte, 4), ab, v_fffc0000);
+    // Combine the low 6 or 7 bits by a shift right accumulate
+    // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct
+    // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o
+    // correction
+    __m128i composed = __lsx_vor_v(ascii, __lsx_vsrli_w(abc, 6));
+    // After this is for surrogates
+    // Blend the low and high surrogates
+    // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd
+    __m128i mixed =
+        __lsx_vbitsel_v(abc, composed, __lsx_vldi(-1741) /*0x0000FFFF*/);
+    // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits
+    // yet as 0x10000 was not subtracted from the codepoint yet.
+    // 4 byte: 11110aaa bbbbbbcc|000000cc ccdddddd
+    __m128i v_ffff03ff = __lsx_vreplgr2vr_w(uint32_t(0xFFFF03FF));
+    __m128i masked_pair = __lsx_vand_v(mixed, v_ffff03ff);
+    // Fix the remaining UTF-8 prefix and surrogate offset and add the surrogate
+    // prefixes in one magic add (no continue-byte adjust, halfwords swapped):
+    // UTF-8 4b prefix   = -0xF000|0x0000
+    // surrogate offset  = -0x0040|0x0000 (0x10000 << 6)
+    // surrogate high    = +0xD800|0x0000
+    // surrogate low     = +0x0000|0xDC00
+    // -----------------------------------
+    //                   = +0xE7C0|0xDC00
+    __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xE7C0DC00));
+    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete
+    __m128i surrogates = __lsx_vadd_w(masked_pair, magic);
+    // If the high bit is 1 (s32 less than zero), this needs a surrogate pair
+    __m128i is_pair = __lsx_vslt_w(perm, zero);
+    // Select either the 4 byte surrogate pair or the 2 byte solo codepoint
+    // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd
+    // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD
+    __m128i selected = __lsx_vbitsel_v(composed, surrogates, is_pair);
+    // Byte swap if necessary
+    if (!match_system(big_endian)) {
+      selected = lsx_swap_bytes(selected);
+    }
+    // Attempting to shuffle and store would be complex, just scalarize.
+    uint32_t buffer_tmp[4];
+    __lsx_vst(selected, buffer_tmp, 0);
+    // Test for the top bit of the surrogate mask. Remove due to issue 514
+    // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 :
+    // 0x00800000;
     for (size_t i = 0; i < 3; i++) {
-      if (basic_buffer[i] > 0x3c00000) {
-        utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff);
-        utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16);
+      // Surrogate
+      // Used to be if (buffer[i] & SURROGATE_MASK) {
+      // See discussion above.
+      // patch for issue https://github.com/simdutf/simdutf/issues/514
+      if ((permbuffer[i] & 0xf8000000) == 0xf0000000) {
+        utf16_output[0] = uint16_t(buffer_tmp[i] >> 16);
+        utf16_output[1] = uint16_t(buffer_tmp[i] & 0xFFFF);
         utf16_output += 2;
       } else {
-        utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i])
-                                     : uint16_t(basic_buffer[i]);
+        utf16_output[0] = uint16_t(buffer_tmp[i] & 0xFFFF);
         utf16_output++;
       }
     }
+    return consumed;
   } else {
     // here we know that there is an error but we do not handle errors
+    return 12;
   }
-  return consumed;
 }
-/* end file src/westmere/sse_convert_utf8_to_utf16.cpp */
-/* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */
-// depends on "tables/utf8_to_utf16_tables.h"
-
+/* end file src/lasx/lasx_convert_utf8_to_utf16.cpp */
+/* begin file src/lasx/lasx_convert_utf8_to_utf32.cpp */
 // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the
 // end of the code points. Only the least significant 12 bits of the mask
 // are accessed.
 // It returns how many bytes were consumed (up to 12).
 size_t convert_masked_utf8_to_utf32(const char *input,
                                     uint64_t utf8_end_of_code_point_mask,
-                                    char32_t *&utf32_output) {
+                                    char32_t *&utf32_out) {
   // we use an approach where we try to process up to 12 input bytes.
   // Why 12 input bytes and not 16? Because we are concerned with the size of
   // the lookup tables. Also 12 is nicely divisible by two and three.
   //
+  uint32_t *&utf32_output = reinterpret_cast<uint32_t *&>(utf32_out);
+  __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xFFF;
   //
   // Optimization note: our main path below is load-latency dependent. Thus it
   // is maybe beneficial to have fast paths that depend on branch prediction but
@@ -35958,135 +50867,179 @@ size_t convert_masked_utf8_to_utf32(const char *input,
   // higher speeds.
   //
   // We first try a few fast paths.
-  const __m128i in = _mm_loadu_si128((__m128i *)input);
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask & 0xfff;
-  if (utf8_end_of_code_point_mask == 0xfff) {
-    // We process the data in chunks of 12 bytes.
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
-                     _mm_cvtepu8_epi32(in));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
-                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 4)));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 8),
-                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 8)));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 12),
-                     _mm_cvtepu8_epi32(_mm_srli_si128(in, 12)));
-    utf32_output += 12; // We wrote 12 32-bit characters.
-    return 12;          // We consumed 12 bytes.
-  }
-  if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) {
-    // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte
-    // UTF-32 code units. There is probably a more efficient sequence, but the
-    // following might do.
-    const __m128i sh =
-        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
-                     _mm_cvtepu16_epi32(composed));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
-                     _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
-    utf32_output += 8; // We wrote 32 bytes, 8 code points.
-    return 16;
+  if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) {
+    // We process in chunks of 16 bytes.
+    // use fast implementation in src/simdutf/arm64/simd.h
+    // Ideally the compiler can keep the tables in registers.
+    __m128i zero = __lsx_vldi(0);
+    __m128i in16low = __lsx_vilvl_b(zero, in);
+    __m128i in16high = __lsx_vilvh_b(zero, in);
+    __m128i in32_0 = __lsx_vilvl_h(zero, in16low);
+    __m128i in32_1 = __lsx_vilvh_h(zero, in16low);
+    __m128i in32_2 = __lsx_vilvl_h(zero, in16high);
+    __m128i in32_3 = __lsx_vilvh_h(zero, in16high);
+
+    __lsx_vst(in32_0, reinterpret_cast<uint32_t *>(utf32_output), 0);
+    __lsx_vst(in32_1, reinterpret_cast<uint32_t *>(utf32_output), 16);
+    __lsx_vst(in32_2, reinterpret_cast<uint32_t *>(utf32_output), 32);
+    __lsx_vst(in32_3, reinterpret_cast<uint32_t *>(utf32_output), 48);
+
+    utf32_output += 16; // We wrote 16 32-bit characters.
+    return 16;          // We consumed 16 bytes.
   }
+  __m128i zero = __lsx_vldi(0);
   if (input_utf8_end_of_code_point_mask == 0x924) {
     // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte
-    // UTF-32 code units. There is probably a more efficient sequence, but the
-    // following might do.
-    const __m128i sh =
-        _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 4;
-    return 12;
+    // UTF-32 code units. Convert to UTF-16
+    __m128i composed_utf16 = convert_utf8_3_byte_to_utf16(in);
+    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
+
+    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
+    utf32_output += 4; // We wrote 4 32-bit characters.
+    return 12;         // We consumed 12 bytes.
   }
-  /// We do not have a fast path available, so we fallback.
+  // 2 byte sequences occur in short bursts in languages like Greek and Russian.
+  if (input_utf8_end_of_code_point_mask == 0xaaa) {
+    // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte
+    // UTF-32 code units. Convert to UTF-16
+    __m128i composed_utf16 = convert_utf8_2_byte_to_utf16(in);
+
+    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
+    __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);
+
+    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
+    __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
+    utf32_output += 6;
+    return 12; // We consumed 12 bytes.
+  }
+  // Either no fast path or an unimportant fast path.
+
+  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][0];
+  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][1];
 
-  const uint8_t idx =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
   if (idx < 64) {
     // SIX (6) input code-code units
-    // this is a relatively easy scenario
-    // we process SIX (6) input code-code units. The max length in bytes of six
-    // code code units spanning between 1 and 2 bytes each is 12 bytes. On
-    // processors where pdep/pext is fast, we might be able to use a small
-    // lookup table.
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-    const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
-                     _mm_cvtepu16_epi32(composed));
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
-                     _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8)));
-    utf32_output += 6; // We wrote 12 bytes, 6 code points.
+    // Convert to UTF-16
+    __m128i composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx);
+    __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16);
+    __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16);
+
+    __lsx_vst(utf32_low, reinterpret_cast<uint32_t *>(utf32_output), 0);
+    __lsx_vst(utf32_high, reinterpret_cast<uint32_t *>(utf32_output), 16);
+    utf32_output += 6;
+    return consumed;
   } else if (idx < 145) {
     // FOUR (4) input code-code units
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii =
-        _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits
-    const __m128i middlebyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    const __m128i highbyte =
-        _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted);
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 4;
+    // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing.
+    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+                           0);
+    // Shuffle
+    // 1 byte: 00000000 00000000 0ccccccc
+    // 2 byte: 00000000 110bbbbb 10cccccc
+    // 3 byte: 1110aaaa 10bbbbbb 10cccccc
+    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+    __m128i perm = __lsx_vshuf_b(zero, in, sh);
+    // Split
+    // 00000000 00000000 0ccccccc
+    __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); // 6 or 7 bits
+    // Note: unmasked
+    // xxxxxxxx aaaaxxxx xxxxxxxx
+    __m128i high =
+        __lsx_vsrli_w(__lsx_vand_v(perm, __lsx_vldi(0xf)), 4); // 4 bits
+    // Isolate the middle byte with a plain AND mask.
+    // The top bits will be corrected later by the bitwise select (vbitsel)
+    // 00000000 10bbbbbb 00000000
+    __m128i middle =
+        __lsx_vand_v(perm, __lsx_vldi(-1758 /*0x0000FF00*/)); // 5 or 6 bits
+    // Combine low and middle: OR in the middle byte shifted right by 2
+    // 00000000 00xxbbbb bbcccccc
+    __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2));
+    // Insert top 4 bits from high byte with bitwise select
+    // 00000000 aaaabbbb bbcccccc
+    __m128i composed =
+        __lsx_vbitsel_v(lowmid, high, __lsx_vldi(-3600 /*0x0000F000*/));
+    __lsx_vst(composed, utf32_output, 0);
+    utf32_output += 4; // We wrote 4 32-bit characters.
+    return consumed;
   } else if (idx < 209) {
-    // TWO (2) input code-code units
-    const __m128i sh =
-        _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-    const __m128i perm = _mm_shuffle_epi8(in, sh);
-    const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f));
-    const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00));
-    const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2);
-    __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000));
-    // correct for spurious high bit
-    const __m128i correct =
-        _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1);
-    middlehighbyte = _mm_xor_si128(correct, middlehighbyte);
-    const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4);
-    const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000));
-    const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6);
-    const __m128i composed =
-        _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted),
-                     _mm_or_si128(highbyte_shifted, middlehighbyte_shifted));
-    _mm_storeu_si128((__m128i *)utf32_output, composed);
-    utf32_output += 3;
+    // THREE (3) input code-code units
+    if (input_utf8_end_of_code_point_mask == 0x888) {
+      // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte
+      // UTF-32 code units. This uses the same method as the fixed 3 byte
+      // version, reversing and shift left insert. However, there is no need for
+      // a shuffle mask now, just rev16 and rev32.
+      //
+      // This version does not use the LUT, but 4 byte sequences are less common
+      // and the overhead of the extra memory access is less important than the
+      // early branch overhead in shorter sequences, so it comes last.
+
+      // Swap pairs of bytes
+      // 10dddddd|10cccccc|10bbbbbb|11110aaa
+      // 10cccccc 10dddddd|11110aaa 10bbbbbb
+      __m128i swap = lsx_swap_bytes(in);
+      // Shift left and insert
+      // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb
+      __m128i merge1 = __lsx_vbitsel_v(__lsx_vsrli_h(swap, 2), swap,
+                                       __lsx_vrepli_h(0x3f /*0x003F*/));
+      // Shift insert again
+      // xxxxxxxx xxxaaabb bbbbcccc ccdddddd
+      __m128i merge2 =
+          __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */
+                          __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */
+                          __lsx_vldi(-2545));        /*0x00000FFF*/
+      // Clear the garbage
+      // 00000000 000aaabb bbbbcccc ccdddddd
+      __m128i composed = __lsx_vand_v(merge2, __lsx_vldi(-2273 /*0x1FFFFF*/));
+      // Store
+      __lsx_vst(composed, utf32_output, 0);
+      utf32_output += 3; // We wrote 3 32-bit characters.
+      return 12;         // We consumed 12 bytes.
+    }
+    // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit
+    // due to surrogates no longer being involved.
+    __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+                               simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+                           0);
+    // 1 byte: 00000000 00000000 00000000 0ddddddd
+    // 2 byte: 00000000 00000000 110ccccc 10dddddd
+    // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd
+    // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd
+    sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+    __m128i perm = __lsx_vshuf_b(zero, in, sh);
+
+    // Ascii
+    __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F));
+    __m128i middle = __lsx_vand_v(perm, __lsx_vldi(-3777 /*0x00003f00*/));
+    // 00000000 00000000 0000cccc ccdddddd
+    __m128i cd =
+        __lsx_vbitsel_v(__lsx_vsrli_w(middle, 2), ascii, __lsx_vrepli_w(0x3f));
+
+    __m128i correction = __lsx_vand_v(perm, __lsx_vldi(-3520 /*0x00400000*/));
+    __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1));
+    // Insert twice
+    // 00000000 000aaabb bbbbxxxx xxxxxxxx
+    __m128i corrected_srli2 =
+        __lsx_vsrli_w(__lsx_vand_v(corrected, __lsx_vrepli_b(0x7)), 2);
+    __m128i ab =
+        __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f));
+    ab = __lsx_vsrli_w(ab, 4);
+    // 00000000 000aaabb bbbbcccc ccdddddd
+    __m128i composed =
+        __lsx_vbitsel_v(ab, cd, __lsx_vldi(-2545 /*0x00000FFF*/));
+    // Store
+    __lsx_vst(composed, utf32_output, 0);
+    utf32_output += 3; // We wrote 3 32-bit characters.
+    return consumed;
   } else {
     // here we know that there is an error but we do not handle errors
+    return 12;
   }
-  return consumed;
 }
-/* end file src/westmere/sse_convert_utf8_to_utf32.cpp */
-/* begin file src/westmere/sse_convert_utf8_to_latin1.cpp */
-// depends on "tables/utf8_to_utf16_tables.h"
-
-// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the
-// end of the code points. Only the least significant 12 bits of the mask
-// are accessed.
-// It returns how many bytes were consumed (up to 12).
+/* end file src/lasx/lasx_convert_utf8_to_utf32.cpp */
+/* begin file src/lasx/lasx_convert_utf8_to_latin1.cpp */
 size_t convert_masked_utf8_to_latin1(const char *input,
                                      uint64_t utf8_end_of_code_point_mask,
                                      char *&latin1_output) {
@@ -36094,27 +51047,30 @@ size_t convert_masked_utf8_to_latin1(const char *input,
   // Why 12 input bytes and not 16? Because we are concerned with the size of
   // the lookup tables. Also 12 is nicely divisible by two and three.
   //
-  //
+  __m128i in = __lsx_vld(reinterpret_cast<const uint8_t *>(input), 0);
+
+  const uint16_t input_utf8_end_of_code_point_mask =
+      utf8_end_of_code_point_mask & 0xfff;
   // Optimization note: our main path below is load-latency dependent. Thus it
   // is maybe beneficial to have fast paths that depend on branch prediction but
   // have less latency. This results in more instructions but, potentially, also
   // higher speeds.
-  //
-  const __m128i in = _mm_loadu_si128((__m128i *)input);
-  const uint16_t input_utf8_end_of_code_point_mask =
-      utf8_end_of_code_point_mask &
-      0xfff; // we are only processing 12 bytes in case it is not all ASCII
-  if (utf8_end_of_code_point_mask == 0xfff) {
-    // We process the data in chunks of 12 bytes.
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in);
-    latin1_output += 12; // We wrote 12 characters.
-    return 12;           // We consumed 12 bytes.
+
+  // We first try a few fast paths.
+  // The obvious first test is ASCII, which actually consumes the full 16.
+  if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) {
+    // We process in chunks of 16 bytes
+    __lsx_vst(in, reinterpret_cast<uint8_t *>(latin1_output), 0);
+    latin1_output += 16; // We wrote 16 8-bit characters.
+    return 16;           // We consumed 16 bytes.
   }
-  /// We do not have a fast path available, so we fallback.
-  const uint8_t idx =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0];
-  const uint8_t consumed =
-      tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1];
+  /// We do not have a fast path available, or the fast path is unimportant, so
+  /// we fallback.
+  const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][0];
+
+  const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex
+      [input_utf8_end_of_code_point_mask][1];
   // this indicates an invalid input:
   if (idx >= 64) {
     return consumed;
@@ -36122,50 +51078,63 @@ size_t convert_masked_utf8_to_latin1(const char *input,
   // Here we should have (idx < 64), if not, there is a bug in the validation or
   // elsewhere. SIX (6) input code-code units this is a relatively easy scenario
   // we process SIX (6) input code-code units. The max length in bytes of six
-  // code code units spanning between 1 and 2 bytes each is 12 bytes. On
-  // processors where pdep/pext is fast, we might be able to use a small lookup
-  // table.
-  const __m128i sh =
-      _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]);
-  const __m128i perm = _mm_shuffle_epi8(in, sh);
-  const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f));
-  const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00));
-  __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2));
-  const __m128i latin1_packed = _mm_packus_epi16(composed, composed);
+  // code code units spanning between 1 and 2 bytes each is 12 bytes. Here we
+  // convert six 1-2 byte UTF-8 characters to six Latin-1 characters: each
+  // character is shuffled into its own 16-bit lane, composed there, and then
+  // narrowed to a single output byte.
+  __m128i sh = __lsx_vld(reinterpret_cast<const uint8_t *>(
+                             simdutf::tables::utf8_to_utf16::shufutf8[idx]),
+                         0);
+  // Shuffle
+  // 1 byte: 00000000 0bbbbbbb
+  // 2 byte: 110aaaaa 10bbbbbb
+  sh = __lsx_vand_v(sh, __lsx_vldi(0x1f));
+  __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh);
+  // ascii mask
+  // 1 byte: 11111111 11111111
+  // 2 byte: 00000000 00000000
+  __m128i ascii_mask = __lsx_vslt_bu(perm, __lsx_vldi(0x80));
+  // utf8 mask
+  // 1 byte: 00000000 00000000
+  // 2 byte: 00111111 00111111
+  __m128i utf8_mask = __lsx_vand_v(__lsx_vsle_bu(__lsx_vldi(0x80), perm),
+                                   __lsx_vldi(0b00111111));
+  // mask
+  //  1 byte: 11111111 11111111
+  //  2 byte: 00111111 00111111
+  __m128i mask = __lsx_vor_v(utf8_mask, ascii_mask);
+
+  __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), perm, mask);
   // writing 8 bytes even though we only care about the first 6 bytes.
-  // performance note: it would be faster to use _mm_storeu_si128, we should
-  // investigate.
-  _mm_storel_epi64((__m128i *)latin1_output, latin1_packed);
+  __m128i latin1_packed = __lsx_vpickev_b(__lsx_vldi(0), composed);
+
+  __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
   latin1_output += 6; // We wrote 6 bytes.
   return consumed;
 }
-/* end file src/westmere/sse_convert_utf8_to_latin1.cpp */
+/* end file src/lasx/lasx_convert_utf8_to_latin1.cpp */
 
-/* begin file src/westmere/sse_convert_utf16_to_latin1.cpp */
+/* begin file src/lasx/lasx_convert_utf16_to_latin1.cpp */
 template <endianness big_endian>
 std::pair<const char16_t *, char *>
-sse_convert_utf16_to_latin1(const char16_t *buf, size_t len,
-                            char *latin1_output) {
+lasx_convert_utf16_to_latin1(const char16_t *buf, size_t len,
+                             char *latin1_output) {
   const char16_t *end = buf + len;
-  while (end - buf >= 8) {
-    // Load 8 UTF-16 characters into 128-bit SSE register
-    __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf));
-
+  while (buf + 16 <= end) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    __m128i in1 = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
     if (!match_system(big_endian)) {
-      const __m128i swap =
-          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
+      in = lsx_swap_bytes(in);
+      in1 = lsx_swap_bytes(in1);
     }
-
-    __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00);
-    if (_mm_testz_si128(in, high_byte_mask)) {
-      // Pack 16-bit characters into 8-bit and store in latin1_output
-      __m128i latin1_packed = _mm_packus_epi16(in, in);
-      _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
-                       latin1_packed);
-      // Adjust pointers for next iteration
-      buf += 8;
-      latin1_output += 8;
+    if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) {
+      // 1. pack the bytes
+      __m128i latin1_packed = __lsx_vpickev_b(in1, in);
+      // 2. store (16 bytes)
+      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+      // 3. adjust pointers
+      buf += 16;
+      latin1_output += 16;
     } else {
       return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
     }
@@ -36175,29 +51144,28 @@ sse_convert_utf16_to_latin1(const char16_t *buf, size_t len,
 
 template <endianness big_endian>
 std::pair<result, char *>
-sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
-                                        char *latin1_output) {
+lasx_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
+                                         char *latin1_output) {
   const char16_t *start = buf;
   const char16_t *end = buf + len;
-  while (end - buf >= 8) {
-    __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(buf));
-
+  while (buf + 16 <= end) {
+    __m128i in = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 0);
+    __m128i in1 = __lsx_vld(reinterpret_cast<const uint16_t *>(buf), 16);
     if (!match_system(big_endian)) {
-      const __m128i swap =
-          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
+      in = lsx_swap_bytes(in);
+      in1 = lsx_swap_bytes(in1);
     }
-
-    __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00);
-    if (_mm_testz_si128(in, high_byte_mask)) {
-      __m128i latin1_packed = _mm_packus_epi16(in, in);
-      _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output),
-                       latin1_packed);
-      buf += 8;
-      latin1_output += 8;
+    if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) {
+      // 1. pack the bytes
+      __m128i latin1_packed = __lsx_vpickev_b(in1, in);
+      // 2. store (16 bytes)
+      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+      // 3. adjust pointers
+      buf += 16;
+      latin1_output += 16;
     } else {
-      // Fallback to scalar code for handling errors
-      for (int k = 0; k < 8; k++) {
+      // Let us do a scalar fallback.
+      for (int k = 0; k < 16; k++) {
         uint16_t word = !match_system(big_endian)
                             ? scalar::utf16::swap_bytes(buf[k])
                             : buf[k];
@@ -36208,16 +51176,15 @@ sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
                                 latin1_output);
         }
       }
-      buf += 8;
     }
   } // while
   return std::make_pair(result(error_code::SUCCESS, buf - start),
                         latin1_output);
 }
-/* end file src/westmere/sse_convert_utf16_to_latin1.cpp */
-/* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */
+/* end file src/lasx/lasx_convert_utf16_to_latin1.cpp */
+/* begin file src/lasx/lasx_convert_utf16_to_utf8.cpp */
 /*
-    The vectorized algorithm works on single SSE register i.e., it
+    The vectorized algorithm works on single LASX register i.e., it
     loads eight 16-bit code units.
 
     We consider three cases:
@@ -36231,11 +51198,11 @@ sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
     Ad 1.
 
     When values are less than 0x0800, it means that a 16-bit code unit
-    can be converted into: 1) single UTF8 byte (when it is an ASCII
+    can be converted into: 1) single UTF8 byte (when it's an ASCII
     char) or 2) two UTF8 bytes.
 
     For this case we do only some shuffle to obtain these 2-byte
-    codes and finally compress the whole SSE register with a single
+    codes and finally compress the whole LASX register with a single
     shuffle.
 
     We need 256-entry lookup table to get a compression pattern
@@ -36253,7 +51220,7 @@ sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
     the three-UTF8-bytes case.
 
     Finally these two registers are interleaved forming eight-element
-    array of 32-bit values. The array spans two SSE registers.
+    array of 32-bit values. The array spans two LASX registers.
     The bytes from the registers are compressed using two shuffles.
 
     We need 256-entry lookup table to get a compression pattern
@@ -36264,187 +51231,210 @@ sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
     To summarize:
     - We need two 256-entry tables that have 8704 bytes in total.
 */
-
 /*
   Returns a pair: the first unprocessed byte from buf and utf8_output
   A scalar routing should carry on the conversion of the tail.
 */
+
 template <endianness big_endian>
 std::pair<const char16_t *, char *>
-sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
-
+lasx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
   const char16_t *end = buf + len;
 
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
   const size_t safety_margin =
       12; // to avoid overruns, see issue
           // https://github.com/simdutf/simdutf/issues/92
 
-  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
-    __m128i in = _mm_loadu_si128((__m128i *)buf);
-    if (big_endian) {
-      const __m128i swap =
-          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
+  __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7ff));
+  __m256i zero = __lasx_xvldi(0);
+  __m128i zero_128 = __lsx_vldi(0);
+  while (buf + 16 + safety_margin <= end) {
+    __m256i in = __lasx_xvld(reinterpret_cast<const uint16_t *>(buf), 0);
+    if (!match_system(big_endian)) {
+      in = lasx_swap_bytes(in);
     }
-    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-    const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
-    if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
-      __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
-      if (big_endian) {
-        const __m128i swap =
-            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        nextin = _mm_shuffle_epi8(nextin, swap);
-      }
-      if (!_mm_testz_si128(nextin, v_ff80)) {
-        // 1. pack the bytes
-        // obviously suboptimal.
-        const __m128i utf8_packed = _mm_packus_epi16(in, in);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 8;
-        utf8_output += 8;
-        in = nextin;
-      } else {
-        // 1. pack the bytes
-        // obviously suboptimal.
-        const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 16;
-        utf8_output += 16;
-        continue; // we are done for this round!
-      }
+    if (__lasx_xbnz_h(__lasx_xvslt_hu(
+            in, __lasx_xvrepli_h(0x7F)))) { // ASCII fast path!!!!
+      // 1. pack the bytes
+      __m256i utf8_packed =
+          __lasx_xvpermi_d(__lasx_xvpickev_b(in, in), 0b00001000);
+      // 2. store (16 bytes)
+      __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
+      // 3. adjust pointers
+      buf += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
     }
 
-    // no bits set above 7th bit
-    const __m128i one_byte_bytemask =
-        _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
-    const uint16_t one_byte_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+    if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, in))) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+      // expected output   : [110a|aaaa|10bb|bbbb] x 16
+      // t0 = [000a|aaaa|bbbb|bb00]
+      __m256i t0 = __lasx_xvslli_h(in, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/));
+      // t2 = [0000|0000|00bb|bbbb]
+      __m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f));
+      // t3 = [000a|aaaa|00bb|bbbb]
+      __m256i t3 = __lasx_xvor_v(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xc080));
+      __m256i t4 = __lasx_xvor_v(t3, v_c080);
+      // 2. merge ASCII and 2-byte codewords
+      __m256i one_byte_bytemask =
+          __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F /*0x007F*/));
+      __m256i utf8_unpacked = __lasx_xvbitsel_v(t4, in, one_byte_bytemask);
+      // 3. prepare bitmask for 8-bit lookup
+      __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
+      uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
+      uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
+      // 4. pack the bytes
+      const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                                [lasx_1_2_utf8_bytes_mask[m1]][0];
+      __m128i shuffle1 = __lsx_vld(row1, 1);
+      __m128i utf8_packed1 =
+          __lsx_vshuf_b(zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);
+
+      const uint8_t *row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                                [lasx_1_2_utf8_bytes_mask[m2]][0];
+      __m128i shuffle2 = __lsx_vld(row2, 1);
+      __m128i utf8_packed2 =
+          __lsx_vshuf_b(zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
+      // 5. store bytes
+      __lsx_vst(utf8_packed1, utf8_output, 0);
+      utf8_output += row1[0];
 
-    // no bits set above 11th bit
-    const __m128i one_or_two_bytes_bytemask =
-        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
-    const uint16_t one_or_two_bytes_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+      __lsx_vst(utf8_packed2, utf8_output, 0);
+      utf8_output += row2[0];
 
-    if (one_or_two_bytes_bitmask == 0xffff) {
-      internal::westmere::write_v_u16_11bits_to_utf8(
-          in, utf8_output, one_byte_bytemask, one_byte_bitmask);
-      buf += 8;
+      buf += 16;
       continue;
     }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m128i surrogates_bytemask =
-        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint16_t surrogates_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+    __m256i surrogates_bytemask =
+        __lasx_xvseq_h(__lasx_xvand_v(in, __lasx_xvldi(-2568 /*0xF800*/)),
+                       __lasx_xvldi(-2600 /*0xD800*/));
     // It might seem like checking for surrogates_bitmask == 0xc000 could help.
     // However, it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x0000) {
+    if (__lasx_xbz_v(surrogates_bytemask)) {
       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
-      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
       /* In this branch we handle three cases:
-         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
-        single UFT-8 byte
-         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
-        UTF-8 bytes
-         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
-        three UTF-8 bytes
+           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+         single UTF-8 byte
+           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
+         two UTF-8 bytes
+           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+         three UTF-8 bytes
 
-        We expand the input word (16-bit) into two code units (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
+          We expand the input word (16-bit) into two code units (32-bit), thus
+          we have room for four bytes. However, we need five distinct bit
+          layouts. Note that the last byte in cases #2 and #3 is the same.
 
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
+          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+          in register t2.
 
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
+          We precompute byte 1 for case #3 and -- **conditionally** --
+         precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+         they differ by exactly one bit.
 
-        Finally from these two code units we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
+          Finally from these two code units we build proper UTF-8 sequence,
+         taking into account the case (i.e, the number of bytes to write).
+        */
       /**
        * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
        * t2 => [0ccc|cccc] [10cc|cccc]
        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
        */
-#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+      __m256i t0 = __lasx_xvpickev_b(in, in);
+      t0 = __lasx_xvilvl_b(t0, t0);
+
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
+      __m256i t1 = __lasx_xvand_v(t0, v_3f7f);
       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+      __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688));
 
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m128i s0 = _mm_srli_epi16(in, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
-      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
-                                          simdutf_vec(0b0100000000000000));
-      const __m128i s4 = _mm_xor_si128(s3, m0);
-#undef simdutf_vec
+      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+      __m256i s0 = __lasx_xvsrli_h(in, 12);
+      // s1: [aaaa|bbbb|bbcc|cccc] => [aabb|bbbb|cccc|cc00]
+      __m256i s1 = __lasx_xvslli_h(in, 2);
+      // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
+      s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/));
+
+      // [00bb|bbbb|0000|aaaa]
+      __m256i s2 = __lasx_xvor_v(s0, s1);
+      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
+      __m256i s3 = __lasx_xvor_v(s2, v_c0e0);
+      __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(in, v_07ff);
+      __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask,
+                                   __lasx_xvldi(-2752 /*0x4000*/));
+      __m256i s4 = __lasx_xvxor_v(s3, m0);
 
       // 4. expand code units 16-bit => 32-bit
-      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+      __m256i out0 = __lasx_xvilvl_h(s4, t2);
+      __m256i out1 = __lasx_xvilvh_h(s4, t2);
 
       // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint16_t mask =
-          (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
-      if (mask == 0) {
-        // We only have three-byte code units. Use fast path.
-        const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
-                                              15, 13, -1, -1, -1, -1);
-        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
-        utf8_output += 12;
-        buf += 8;
-        continue;
-      }
-      const uint8_t mask0 = uint8_t(mask);
-
+      __m256i one_byte_bytemask = __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F));
+      __m256i one_byte_bytemask_low =
+          __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
+      __m256i one_byte_bytemask_high =
+          __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+      __m256i one_or_two_bytes_bytemask_low =
+          __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
+      __m256i one_or_two_bytes_bytemask_high =
+          __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);
+
+      __m256i mask0 = __lasx_xvmskltz_h(
+          __lasx_xvor_v(one_or_two_bytes_bytemask_low, one_byte_bytemask_low));
+      __m256i mask1 = __lasx_xvmskltz_h(__lasx_xvor_v(
+          one_or_two_bytes_bytemask_high, one_byte_bytemask_high));
+
+      uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
       const uint8_t *row0 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                [0];
+      __m128i shuffle0 = __lsx_vld(row0, 1);
+      __m128i utf8_0 =
+          __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
+      __lsx_vst(utf8_0, utf8_output, 0);
+      utf8_output += row0[0];
 
+      mask = __lasx_xvpickve2gr_wu(mask1, 0);
       const uint8_t *row1 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                [0];
+      __m128i shuffle1 = __lsx_vld(row1, 1);
+      __m128i utf8_1 =
+          __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
+      __lsx_vst(utf8_1, utf8_output, 0);
       utf8_output += row1[0];
 
-      buf += 8;
+      mask = __lasx_xvpickve2gr_wu(mask0, 4);
+      const uint8_t *row2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                [0];
+      __m128i shuffle2 = __lsx_vld(row2, 1);
+      __m128i utf8_2 =
+          __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
+      __lsx_vst(utf8_2, utf8_output, 0);
+      utf8_output += row2[0];
+
+      mask = __lasx_xvpickve2gr_wu(mask1, 4);
+      const uint8_t *row3 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                [0];
+      __m128i shuffle3 = __lsx_vld(row3, 1);
+      __m128i utf8_3 =
+          __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
+      __lsx_vst(utf8_3, utf8_output, 0);
+      utf8_output += row3[0];
+
+      buf += 16;
       // surrogate pair(s) in a register
     } else {
       // Let us do a scalar fallback.
@@ -36456,7 +51446,9 @@ sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
         forward = size_t(end - buf - 1);
       }
       for (; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
         if ((word & 0xFF80) == 0) {
           *utf8_output++ = char(word);
         } else if ((word & 0xF800) == 0) {
@@ -36469,12 +51461,14 @@ sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
         } else {
           // must be a surrogate pair
           uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word =
-              big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
           k++;
           uint16_t diff2 = uint16_t(next_word - 0xDC00);
           if ((diff | diff2) > 0x3FF) {
-            return std::make_pair(nullptr, utf8_output);
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char *>(utf8_output));
           }
           uint32_t value = (diff << 10) + diff2 + 0x10000;
           *utf8_output++ = char((value >> 18) | 0b11110000);
@@ -36486,8 +51480,7 @@ sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
       buf += k;
     }
   } // while
-
-  return std::make_pair(buf, utf8_output);
+  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
 }
 
 /*
@@ -36499,181 +51492,205 @@ sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) {
 */
 template <endianness big_endian>
 std::pair<result, char *>
-sse_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
-                                      char *utf8_output) {
+lasx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
+                                       char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
   const char16_t *start = buf;
   const char16_t *end = buf + len;
 
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
   const size_t safety_margin =
       12; // to avoid overruns, see issue
           // https://github.com/simdutf/simdutf/issues/92
 
-  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
-    __m128i in = _mm_loadu_si128((__m128i *)buf);
-    if (big_endian) {
-      const __m128i swap =
-          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
+  __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7ff));
+  __m256i zero = __lasx_xvldi(0);
+  __m128i zero_128 = __lsx_vldi(0);
+  while (buf + 16 + safety_margin <= end) {
+    __m256i in = __lasx_xvld(reinterpret_cast<const uint16_t *>(buf), 0);
+    if (!match_system(big_endian)) {
+      in = lasx_swap_bytes(in);
     }
-    // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
-    const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80);
-    if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!!
-      __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
-      if (big_endian) {
-        const __m128i swap =
-            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        nextin = _mm_shuffle_epi8(nextin, swap);
-      }
-      if (!_mm_testz_si128(nextin, v_ff80)) {
-        // 1. pack the bytes
-        // obviously suboptimal.
-        const __m128i utf8_packed = _mm_packus_epi16(in, in);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 8;
-        utf8_output += 8;
-        in = nextin;
-      } else {
-        // 1. pack the bytes
-        // obviously suboptimal.
-        const __m128i utf8_packed = _mm_packus_epi16(in, nextin);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 16;
-        utf8_output += 16;
-        continue; // we are done for this round!
-      }
+    if (__lasx_xbnz_h(__lasx_xvslt_hu(
+            in, __lasx_xvrepli_h(0x7F)))) { // ASCII fast path!!!!
+      // 1. pack the bytes
+      __m256i utf8_packed =
+          __lasx_xvpermi_d(__lasx_xvpickev_b(in, in), 0b00001000);
+      // 2. store (16 bytes)
+      __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
+      // 3. adjust pointers
+      buf += 16;
+      utf8_output += 16;
+      continue; // we are done for this round!
     }
 
-    // no bits set above 7th bit
-    const __m128i one_byte_bytemask =
-        _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000);
-    const uint16_t one_byte_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
+    if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, in))) {
+      // 1. prepare 2-byte values
+      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+      // expected output   : [110a|aaaa|10bb|bbbb] x 16
+      // t0 = [000a|aaaa|bbbb|bb00]
+      __m256i t0 = __lasx_xvslli_h(in, 2);
+      // t1 = [000a|aaaa|0000|0000]
+      __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/));
+      // t2 = [0000|0000|00bb|bbbb]
+      __m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f));
+      // t3 = [000a|aaaa|00bb|bbbb]
+      __m256i t3 = __lasx_xvor_v(t1, t2);
+      // t4 = [110a|aaaa|10bb|bbbb]
+      __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xc080));
+      __m256i t4 = __lasx_xvor_v(t3, v_c080);
+      // 2. merge ASCII and 2-byte codewords
+      __m256i one_byte_bytemask =
+          __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F /*0x007F*/));
+      __m256i utf8_unpacked = __lasx_xvbitsel_v(t4, in, one_byte_bytemask);
+      // 3. prepare bitmask for 8-bit lookup
+      __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
+      uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
+      uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
+      // 4. pack the bytes
+      const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                                [lasx_1_2_utf8_bytes_mask[m1]][0];
+      __m128i shuffle1 = __lsx_vld(row1, 1);
+      __m128i utf8_packed1 =
+          __lsx_vshuf_b(zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);
+
+      const uint8_t *row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                                [lasx_1_2_utf8_bytes_mask[m2]][0];
+      __m128i shuffle2 = __lsx_vld(row2, 1);
+      __m128i utf8_packed2 =
+          __lsx_vshuf_b(zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
+      // 5. store bytes
+      __lsx_vst(utf8_packed1, utf8_output, 0);
+      utf8_output += row1[0];
 
-    // no bits set above 11th bit
-    const __m128i one_or_two_bytes_bytemask =
-        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000);
-    const uint16_t one_or_two_bytes_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
+      __lsx_vst(utf8_packed2, utf8_output, 0);
+      utf8_output += row2[0];
 
-    if (one_or_two_bytes_bitmask == 0xffff) {
-      internal::westmere::write_v_u16_11bits_to_utf8(
-          in, utf8_output, one_byte_bytemask, one_byte_bitmask);
-      buf += 8;
+      buf += 16;
       continue;
     }
-
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m128i surrogates_bytemask =
-        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint16_t surrogates_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+    __m256i surrogates_bytemask =
+        __lasx_xvseq_h(__lasx_xvand_v(in, __lasx_xvldi(-2568 /*0xF800*/)),
+                       __lasx_xvldi(-2600 /*0xD800*/));
     // It might seem like checking for surrogates_bitmask == 0xc000 could help.
     // However, it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x0000) {
+    if (__lasx_xbz_v(surrogates_bytemask)) {
       // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
-      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
       /* In this branch we handle three cases:
-         1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
-        single UFT-8 byte
-         2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              - two
-        UTF-8 bytes
-         3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
-        three UTF-8 bytes
+           1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+         single UTF-8 byte
+           2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
+         two UTF-8 bytes
+           3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+         three UTF-8 bytes
 
-        We expand the input word (16-bit) into two code units (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
+          We expand the input word (16-bit) into two code units (32-bit), thus
+          we have room for four bytes. However, we need five distinct bit
+          layouts. Note that the last byte in cases #2 and #3 is the same.
 
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
+          We precompute byte 1 for case #1 and the common byte for cases #2 & #3
+          in register t2.
 
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
+          We precompute byte 1 for case #3 and -- **conditionally** --
+         precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+         they differ by exactly one bit.
 
-        Finally from these two code units we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
+          Finally from these two code units we build proper UTF-8 sequence,
+         taking into account the case (i.e, the number of bytes to write).
+        */
       /**
        * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
        * t2 => [0ccc|cccc] [10cc|cccc]
        * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
        */
-#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
       // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m128i t0 = _mm_shuffle_epi8(in, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
+      __m256i t0 = __lasx_xvpickev_b(in, in);
+      t0 = __lasx_xvilvl_b(t0, t0);
+
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
+      __m256i t1 = __lasx_xvand_v(t0, v_3f7f);
       // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+      __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688));
 
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m128i s0 = _mm_srli_epi16(in, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
-      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
-                                          simdutf_vec(0b0100000000000000));
-      const __m128i s4 = _mm_xor_si128(s3, m0);
-#undef simdutf_vec
+      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+      __m256i s0 = __lasx_xvsrli_h(in, 12);
+      // s1: [aaaa|bbbb|bbcc|cccc] => [aabb|bbbb|cccc|cc00]
+      __m256i s1 = __lasx_xvslli_h(in, 2);
+      // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000]
+      s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/));
+
+      // [00bb|bbbb|0000|aaaa]
+      __m256i s2 = __lasx_xvor_v(s0, s1);
+      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
+      __m256i s3 = __lasx_xvor_v(s2, v_c0e0);
+      __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(in, v_07ff);
+      __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask,
+                                   __lasx_xvldi(-2752 /*0x4000*/));
+      __m256i s4 = __lasx_xvxor_v(s3, m0);
 
       // 4. expand code units 16-bit => 32-bit
-      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+      __m256i out0 = __lasx_xvilvl_h(s4, t2);
+      __m256i out1 = __lasx_xvilvh_h(s4, t2);
 
       // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint16_t mask =
-          (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
-      if (mask == 0) {
-        // We only have three-byte code units. Use fast path.
-        const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
-                                              15, 13, -1, -1, -1, -1);
-        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
-        utf8_output += 12;
-        buf += 8;
-        continue;
-      }
-      const uint8_t mask0 = uint8_t(mask);
-
+      __m256i one_byte_bytemask = __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F));
+      __m256i one_byte_bytemask_low =
+          __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
+      __m256i one_byte_bytemask_high =
+          __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+      __m256i one_or_two_bytes_bytemask_low =
+          __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
+      __m256i one_or_two_bytes_bytemask_high =
+          __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);
+
+      __m256i mask0 = __lasx_xvmskltz_h(
+          __lasx_xvor_v(one_or_two_bytes_bytemask_low, one_byte_bytemask_low));
+      __m256i mask1 = __lasx_xvmskltz_h(__lasx_xvor_v(
+          one_or_two_bytes_bytemask_high, one_byte_bytemask_high));
+
+      uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
       const uint8_t *row0 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
-
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                [0];
+      __m128i shuffle0 = __lsx_vld(row0, 1);
+      __m128i utf8_0 =
+          __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
+      __lsx_vst(utf8_0, utf8_output, 0);
+      utf8_output += row0[0];
 
+      mask = __lasx_xvpickve2gr_wu(mask1, 0);
       const uint8_t *row1 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
-
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                [0];
+      __m128i shuffle1 = __lsx_vld(row1, 1);
+      __m128i utf8_1 =
+          __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
+      __lsx_vst(utf8_1, utf8_output, 0);
       utf8_output += row1[0];
 
-      buf += 8;
+      mask = __lasx_xvpickve2gr_wu(mask0, 4);
+      const uint8_t *row2 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                [0];
+      __m128i shuffle2 = __lsx_vld(row2, 1);
+      __m128i utf8_2 =
+          __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
+      __lsx_vst(utf8_2, utf8_output, 0);
+      utf8_output += row2[0];
+
+      mask = __lasx_xvpickve2gr_wu(mask1, 4);
+      const uint8_t *row3 =
+          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                [0];
+      __m128i shuffle3 = __lsx_vld(row3, 1);
+      __m128i utf8_3 =
+          __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
+      __lsx_vst(utf8_3, utf8_output, 0);
+      utf8_output += row3[0];
+
+      buf += 16;
       // surrogate pair(s) in a register
     } else {
       // Let us do a scalar fallback.
@@ -36685,7 +51702,9 @@ sse_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
         forward = size_t(end - buf - 1);
       }
       for (; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
         if ((word & 0xFF80) == 0) {
           *utf8_output++ = char(word);
         } else if ((word & 0xF800) == 0) {
@@ -36698,14 +51717,15 @@ sse_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
         } else {
           // must be a surrogate pair
           uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word =
-              big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
           k++;
           uint16_t diff2 = uint16_t(next_word - 0xDC00);
           if ((diff | diff2) > 0x3FF) {
             return std::make_pair(
                 result(error_code::SURROGATE, buf - start + k - 1),
-                utf8_output);
+                reinterpret_cast<char *>(utf8_output));
           }
           uint32_t value = (diff << 10) + diff2 + 0x10000;
           *utf8_output++ = char((value >> 18) | 0b11110000);
@@ -36718,101 +51738,67 @@ sse_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len,
     }
   } // while
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
-}
-/* end file src/westmere/sse_convert_utf16_to_utf8.cpp */
-/* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */
-/*
-    The vectorized algorithm works on single SSE register i.e., it
-    loads eight 16-bit code units.
-
-    We consider three cases:
-    1. an input register contains no surrogates and each value
-       is in range 0x0000 .. 0x07ff.
-    2. an input register contains no surrogates and values are
-       is in range 0x0000 .. 0xffff.
-    3. an input register contains surrogates --- i.e. codepoints
-       can have 16 or 32 bits.
-
-    Ad 1.
-
-    When values are less than 0x0800, it means that a 16-bit code unit
-    can be converted into: 1) single UTF8 byte (when it's an ASCII
-    char) or 2) two UTF8 bytes.
-
-    For this case we do only some shuffle to obtain these 2-byte
-    codes and finally compress the whole SSE register with a single
-    shuffle.
-
-    We need 256-entry lookup table to get a compression pattern
-    and the number of output bytes in the compressed vector register.
-    Each entry occupies 17 bytes.
-
-    Ad 2.
-
-    When values fit in 16-bit code units, but are above 0x07ff, then
-    a single word may produce one, two or three UTF8 bytes.
-
-    We prepare data for all these three cases in two registers.
-    The first register contains lower two UTF8 bytes (used in all
-    cases), while the second one contains just the third byte for
-    the three-UTF8-bytes case.
-
-    Finally these two registers are interleaved forming eight-element
-    array of 32-bit values. The array spans two SSE registers.
-    The bytes from the registers are compressed using two shuffles.
-
-    We need 256-entry lookup table to get a compression pattern
-    and the number of output bytes in the compressed vector register.
-    Each entry occupies 17 bytes.
-
-
-    To summarize:
-    - We need two 256-entry tables that have 8704 bytes in total.
-*/
-
-/*
-  Returns a pair: the first unprocessed byte from buf and utf8_output
-  A scalar routing should carry on the conversion of the tail.
-*/
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        reinterpret_cast<char *>(utf8_output));
+}
+/* end file src/lasx/lasx_convert_utf16_to_utf8.cpp */
+/* begin file src/lasx/lasx_convert_utf16_to_utf32.cpp */
 template <endianness big_endian>
 std::pair<const char16_t *, char32_t *>
-sse_convert_utf16_to_utf32(const char16_t *buf, size_t len,
-                           char32_t *utf32_output) {
+lasx_convert_utf16_to_utf32(const char16_t *buf, size_t len,
+                            char32_t *utf32_out) {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
   const char16_t *end = buf + len;
 
-  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
+  // Go scalar until utf32_output is 32-byte aligned; unaligned access is slow
+  while (((uint64_t)utf32_output & 0x1f) && buf < end) {
+    uint16_t word =
+        !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[0]) : buf[0];
+    if ((word & 0xF800) != 0xD800) {
+      *utf32_output++ = char32_t(word);
+      buf++;
+    } else {
+      if (buf + 1 >= end) {
+        return std::make_pair(nullptr,
+                              reinterpret_cast<char32_t *>(utf32_output));
+      }
+      // must be a surrogate pair
+      uint16_t diff = uint16_t(word - 0xD800);
+      uint16_t next_word = !match_system(big_endian)
+                               ? scalar::utf16::swap_bytes(buf[1])
+                               : buf[1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if ((diff | diff2) > 0x3FF) {
+        return std::make_pair(nullptr,
+                              reinterpret_cast<char32_t *>(utf32_output));
+      }
+      uint32_t value = (diff << 10) + diff2 + 0x10000;
+      *utf32_output++ = char32_t(value);
+      buf += 2;
+    }
+  }
 
-  while (end - buf >= 8) {
-    __m128i in = _mm_loadu_si128((__m128i *)buf);
+  __m256i v_f800 = __lasx_xvldi(-2568); /*0xF800*/
+  __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/
 
-    if (big_endian) {
-      const __m128i swap =
-          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
+  while (buf + 16 <= end) {
+    __m256i in = __lasx_xvld(reinterpret_cast<const uint16_t *>(buf), 0);
+    if (!match_system(big_endian)) {
+      in = lasx_swap_bytes(in);
     }
 
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m128i surrogates_bytemask =
-        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
-
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint16_t surrogates_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+    __m256i surrogates_bytemask =
+        __lasx_xvseq_h(__lasx_xvand_v(in, v_f800), v_d800);
     // It might seem like checking for surrogates_bitmask == 0xc000 could help.
     // However, it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x0000) {
-      // case: no surrogate pair, extend 16-bit code units to 32-bit code units
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
-                       _mm_cvtepu16_epi32(in));
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
-                       _mm_cvtepu16_epi32(_mm_srli_si128(in, 8)));
-      utf32_output += 8;
-      buf += 8;
+    if (__lasx_xbz_v(surrogates_bytemask)) {
+      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
+      // units
+      __m256i in_hi = __lasx_xvpermi_q(in, in, 0b00000001);
+      __lasx_xvst(__lasx_vext2xv_wu_hu(in), utf32_output, 0);
+      __lasx_xvst(__lasx_vext2xv_wu_hu(in_hi), utf32_output, 32);
+      utf32_output += 16;
+      buf += 16;
       // surrogate pair(s) in a register
     } else {
       // Let us do a scalar fallback.
@@ -36824,18 +51810,22 @@ sse_convert_utf16_to_utf32(const char16_t *buf, size_t len,
         forward = size_t(end - buf - 1);
       }
       for (; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
         if ((word & 0xF800) != 0xD800) {
           *utf32_output++ = char32_t(word);
         } else {
           // must be a surrogate pair
           uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word =
-              big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
           k++;
           uint16_t diff2 = uint16_t(next_word - 0xDC00);
           if ((diff | diff2) > 0x3FF) {
-            return std::make_pair(nullptr, utf32_output);
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char32_t *>(utf32_output));
           }
           uint32_t value = (diff << 10) + diff2 + 0x10000;
           *utf32_output++ = char32_t(value);
@@ -36844,7 +51834,7 @@ sse_convert_utf16_to_utf32(const char16_t *buf, size_t len,
       buf += k;
     }
   } // while
-  return std::make_pair(buf, utf32_output);
+  return std::make_pair(buf, reinterpret_cast<char32_t *>(utf32_output));
 }
 
 /*
@@ -36856,43 +51846,59 @@ sse_convert_utf16_to_utf32(const char16_t *buf, size_t len,
 */
 template <endianness big_endian>
 std::pair<result, char32_t *>
-sse_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
-                                       char32_t *utf32_output) {
+lasx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
+                                        char32_t *utf32_out) {
+  uint32_t *utf32_output = reinterpret_cast<uint32_t *>(utf32_out);
   const char16_t *start = buf;
   const char16_t *end = buf + len;
 
-  const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
-  const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
-
-  while (end - buf >= 8) {
-    __m128i in = _mm_loadu_si128((__m128i *)buf);
-
-    if (big_endian) {
-      const __m128i swap =
-          _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-      in = _mm_shuffle_epi8(in, swap);
+  // Go scalar until utf32_output is 32-byte aligned; unaligned access is slow
+  while (((uint64_t)utf32_output & 0x1f) && buf < end) {
+    uint16_t word =
+        !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[0]) : buf[0];
+    if ((word & 0xF800) != 0xD800) {
+      *utf32_output++ = char32_t(word);
+      buf++;
+    } else if (buf + 1 < end) {
+      // must be a surrogate pair
+      uint16_t diff = uint16_t(word - 0xD800);
+      uint16_t next_word = !match_system(big_endian)
+                               ? scalar::utf16::swap_bytes(buf[1])
+                               : buf[1];
+      uint16_t diff2 = uint16_t(next_word - 0xDC00);
+      if ((diff | diff2) > 0x3FF) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start),
+                              reinterpret_cast<char32_t *>(utf32_output));
+      }
+      uint32_t value = (diff << 10) + diff2 + 0x10000;
+      *utf32_output++ = char32_t(value);
+      buf += 2;
+    } else {
+      return std::make_pair(result(error_code::SURROGATE, buf - start),
+                            reinterpret_cast<char32_t *>(utf32_output));
     }
+  }
 
-    // 1. Check if there are any surrogate word in the input chunk.
-    //    We have also deal with situation when there is a surrogate word
-    //    at the end of a chunk.
-    const __m128i surrogates_bytemask =
-        _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800);
+  __m256i v_f800 = __lasx_xvldi(-2568); /*0xF800*/
+  __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/
+  while (buf + 16 <= end) {
+    __m256i in = __lasx_xvld(reinterpret_cast<const uint16_t *>(buf), 0);
+    if (!match_system(big_endian)) {
+      in = lasx_swap_bytes(in);
+    }
 
-    // bitmask = 0x0000 if there are no surrogates
-    //         = 0xc000 if the last word is a surrogate
-    const uint16_t surrogates_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(surrogates_bytemask));
+    __m256i surrogates_bytemask =
+        __lasx_xvseq_h(__lasx_xvand_v(in, v_f800), v_d800);
     // It might seem like checking for surrogates_bitmask == 0xc000 could help.
     // However, it is likely an uncommon occurrence.
-    if (surrogates_bitmask == 0x0000) {
-      // case: no surrogate pair, extend 16-bit code units to 32-bit code units
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output),
-                       _mm_cvtepu16_epi32(in));
-      _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4),
-                       _mm_cvtepu16_epi32(_mm_srli_si128(in, 8)));
-      utf32_output += 8;
-      buf += 8;
+    if (__lasx_xbz_v(surrogates_bytemask)) {
+      // case: no surrogate pairs, extend all 16-bit code units to 32-bit code
+      // units
+      __m256i in_hi = __lasx_xvpermi_q(in, in, 0b00000001);
+      __lasx_xvst(__lasx_vext2xv_wu_hu(in), utf32_output, 0);
+      __lasx_xvst(__lasx_vext2xv_wu_hu(in_hi), utf32_output, 32);
+      utf32_output += 16;
+      buf += 16;
       // surrogate pair(s) in a register
     } else {
       // Let us do a scalar fallback.
@@ -36904,20 +51910,23 @@ sse_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
         forward = size_t(end - buf - 1);
       }
       for (; k < forward; k++) {
-        uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k];
+        uint16_t word = !match_system(big_endian)
+                            ? scalar::utf16::swap_bytes(buf[k])
+                            : buf[k];
         if ((word & 0xF800) != 0xD800) {
           *utf32_output++ = char32_t(word);
         } else {
           // must be a surrogate pair
           uint16_t diff = uint16_t(word - 0xD800);
-          uint16_t next_word =
-              big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1];
+          uint16_t next_word = !match_system(big_endian)
+                                   ? scalar::utf16::swap_bytes(buf[k + 1])
+                                   : buf[k + 1];
           k++;
           uint16_t diff2 = uint16_t(next_word - 0xDC00);
           if ((diff | diff2) > 0x3FF) {
             return std::make_pair(
                 result(error_code::SURROGATE, buf - start + k - 1),
-                utf32_output);
+                reinterpret_cast<char32_t *>(utf32_output));
           }
           uint32_t value = (diff << 10) + diff2 + 0x10000;
           *utf32_output++ = char32_t(value);
@@ -36926,379 +51935,337 @@ sse_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len,
       buf += k;
     }
   } // while
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output);
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        reinterpret_cast<char32_t *>(utf32_output));
 }
-/* end file src/westmere/sse_convert_utf16_to_utf32.cpp */
+/* end file src/lasx/lasx_convert_utf16_to_utf32.cpp */
 
-/* begin file src/westmere/sse_convert_utf32_to_latin1.cpp */
+/* begin file src/lasx/lasx_convert_utf32_to_latin1.cpp */
 std::pair<const char32_t *, char *>
-sse_convert_utf32_to_latin1(const char32_t *buf, size_t len,
-                            char *latin1_output) {
-  const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
-
-  __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00);
-  __m128i shufmask =
-      _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
-
-  for (size_t i = 0; i < rounded_len; i += 16) {
-    __m128i in1 = _mm_loadu_si128((__m128i *)buf);
-    __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4));
-    __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8));
-    __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12));
+lasx_convert_utf32_to_latin1(const char32_t *buf, size_t len,
+                             char *latin1_output) {
+  const char32_t *end = buf + len; // one past the last input code unit
+  const __m256i shuf_mask = ____m256i( // indices step by 4: one byte per word
+      (__m128i)v16u8{0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0});
+  __m256i v_ff = __lasx_xvrepli_w(0xFF); // largest Latin-1 value, per word
 
-    __m128i check_combined = _mm_or_si128(in1, in2);
-    check_combined = _mm_or_si128(check_combined, in3);
-    check_combined = _mm_or_si128(check_combined, in4);
+  while (buf + 16 <= end) { // 16 code units (two 32-byte loads) per pass
+    __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
+    __m256i in2 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);
 
-    if (!_mm_testz_si128(check_combined, high_bytes_mask)) {
-      return std::make_pair(nullptr, latin1_output);
+    __m256i in12 = __lasx_xvor_v(in1, in2); // OR is > 0xFF iff an input is
+    if (__lasx_xbz_v(__lasx_xvslt_wu(v_ff, in12))) { // nothing above 0xFF
+      // 1. pack the bytes (one byte kept from every 32-bit code unit)
+      __m256i latin1_packed_tmp = __lasx_xvshuf_b(in2, in1, shuf_mask);
+      latin1_packed_tmp = __lasx_xvpermi_d(latin1_packed_tmp, 0b00001000);
+      __m128i latin1_packed = lasx_extracti128_lo(latin1_packed_tmp);
+      latin1_packed = __lsx_vpermi_w(latin1_packed, latin1_packed, 0b11011000);
+      // 2. store (16 bytes)
+      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+      // 3. adjust pointers
+      buf += 16;
+      latin1_output += 16;
+    } else { // some code unit is > 0xFF: signal failure with a null pointer
+      return std::make_pair(nullptr, reinterpret_cast<char *>(latin1_output));
     }
-    __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask),
-                                       _mm_shuffle_epi8(in2, shufmask));
-    __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask),
-                                       _mm_shuffle_epi8(in4, shufmask));
-    __m128i pack = _mm_unpacklo_epi64(pack1, pack2);
-    _mm_storeu_si128((__m128i *)latin1_output, pack);
-    latin1_output += 16;
-    buf += 16;
-  }
-
+  } // while (fewer than 16 code units remain; buf marks where we stopped)
   return std::make_pair(buf, latin1_output);
 }
 
 std::pair<result, char *>
-sse_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
-                                        char *latin1_output) {
+lasx_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
+                                         char *latin1_output) {
   const char32_t *start = buf;
-  const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16
-
-  __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00);
-  __m128i shufmask =
-      _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
+  const char32_t *end = buf + len; // one past the last input code unit
 
-  for (size_t i = 0; i < rounded_len; i += 16) {
-    __m128i in1 = _mm_loadu_si128((__m128i *)buf);
-    __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4));
-    __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8));
-    __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12));
+  const __m256i shuf_mask = ____m256i( // indices step by 4: one byte per word
+      (__m128i)v16u8{0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0});
+  __m256i v_ff = __lasx_xvrepli_w(0xFF); // largest Latin-1 value, per word
 
-    __m128i check_combined = _mm_or_si128(in1, in2);
-    check_combined = _mm_or_si128(check_combined, in3);
-    check_combined = _mm_or_si128(check_combined, in4);
+  while (buf + 16 <= end) { // 16 code units (two 32-byte loads) per pass
+    __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
+    __m256i in2 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);
 
-    if (!_mm_testz_si128(check_combined, high_bytes_mask)) {
-      // Fallback to scalar code for handling errors
+    __m256i in12 = __lasx_xvor_v(in1, in2); // OR is > 0xFF iff an input is
+    if (__lasx_xbz_v(__lasx_xvslt_wu(v_ff, in12))) { // nothing above 0xFF
+      // 1. pack the bytes (one byte kept from every 32-bit code unit)
+      __m256i latin1_packed_tmp = __lasx_xvshuf_b(in2, in1, shuf_mask);
+      latin1_packed_tmp = __lasx_xvpermi_d(latin1_packed_tmp, 0b00001000);
+      __m128i latin1_packed = lasx_extracti128_lo(latin1_packed_tmp);
+      latin1_packed = __lsx_vpermi_w(latin1_packed, latin1_packed, 0b11011000);
+      // 2. store (16 bytes)
+      __lsx_vst(latin1_packed, reinterpret_cast<uint8_t *>(latin1_output), 0);
+      // 3. adjust pointers
+      buf += 16;
+      latin1_output += 16;
+    } else {
+      // Scalar fallback to locate the error. The vector test above found a
+      // code unit > 0xFF among these 16, so this loop always returns; that is
+      // why buf is not advanced in this branch.
       for (int k = 0; k < 16; k++) {
-        char32_t codepoint = buf[k];
-        if (codepoint <= 0xff) {
-          *latin1_output++ = char(codepoint);
+        uint32_t word = buf[k];
+        if (word <= 0xff) {
+          *latin1_output++ = char(word);
         } else {
           return std::make_pair(result(error_code::TOO_LARGE, buf - start + k),
                                 latin1_output);
         }
       }
-      buf += 16;
-      continue;
     }
-    __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask),
-                                       _mm_shuffle_epi8(in2, shufmask));
-    __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask),
-                                       _mm_shuffle_epi8(in4, shufmask));
-    __m128i pack = _mm_unpacklo_epi64(pack1, pack2);
-    _mm_storeu_si128((__m128i *)latin1_output, pack);
-    latin1_output += 16;
-    buf += 16;
-  }
-
+  } // while (fewer than 16 code units remain; count reported is buf - start)
   return std::make_pair(result(error_code::SUCCESS, buf - start),
                         latin1_output);
 }
-/* end file src/westmere/sse_convert_utf32_to_latin1.cpp */
-/* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */
+/* end file src/lasx/lasx_convert_utf32_to_latin1.cpp */
+/* begin file src/lasx/lasx_convert_utf32_to_utf8.cpp */
 std::pair<const char32_t *, char *>
-sse_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
+lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
   const char32_t *end = buf + len;
 
-  const __m128i v_0000 = _mm_setzero_si128();              //__m128 = 128 bits
-  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); // 1111 1000 0000
-                                                           // 0000
-  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); // 1100 0000 1000
-                                                           // 0000
-  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); // 1111 1111 1000
-                                                           // 0000
-  const __m128i v_ffff0000 = _mm_set1_epi32(
-      (uint32_t)0xffff0000); // 1111 1111 1111 1111 0000 0000 0000 0000
-  const __m128i v_7fffffff = _mm_set1_epi32(
-      (uint32_t)0x7fffffff); // 0111 1111 1111 1111 1111 1111 1111 1111
-  __m128i running_max = _mm_setzero_si128();
-  __m128i forbidden_bytemask = _mm_setzero_si128();
+  // scalar prologue: convert one code unit at a time until buf is 32-byte aligned
+  while (((uint64_t)buf & 0x1F) && buf < end) {
+    uint32_t word = *buf;
+    if ((word & 0xFFFFFF80) == 0) {
+      *utf8_output++ = char(word);
+    } else if ((word & 0xFFFFF800) == 0) {
+      *utf8_output++ = char((word >> 6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+    } else if ((word & 0xFFFF0000) == 0) {
+      if (word >= 0xD800 && word <= 0xDFFF) {
+        return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
+      }
+      *utf8_output++ = char((word >> 12) | 0b11100000);
+      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+    } else {
+      if (word > 0x10FFFF) {
+        return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
+      }
+      *utf8_output++ = char((word >> 18) | 0b11110000);
+      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+    }
+    buf++;
+  }
+
+  __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xC080));
+  __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7FF));
+  __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xDFFF));
+  __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/
+  __m256i zero = __lasx_xvldi(0);
+  __m128i zero_128 = __lsx_vldi(0);
+  __m256i forbidden_bytemask = __lasx_xvldi(0x0);
+
   const size_t safety_margin =
       12; // to avoid overruns, see issue
           // https://github.com/simdutf/simdutf/issues/92
 
-  while (end - buf >=
-         std::ptrdiff_t(
-             16 + safety_margin)) { // buf is a char32_t pointer, each char32_t
-                                    // has 4 bytes or 32 bits, thus buf + 16 *
-                                    // char_32t = 512 bits = 64 bytes
-    // We load two 16 bytes registers for a total of 32 bytes or 16 characters.
-    __m128i in = _mm_loadu_si128((__m128i *)buf);
-    __m128i nextin = _mm_loadu_si128(
-        (__m128i *)buf + 1); // These two values can hold only 8 UTF32 chars
-    running_max = _mm_max_epu32(
-        _mm_max_epu32(in, running_max), // take element-wise max char32_t from
-                                        // in and running_max vector
-        nextin); // and take element-wise max element from nextin and
-                 // running_max vector
-
-    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
-    // saturation
-    __m128i in_16 = _mm_packus_epi32(
-        _mm_and_si128(in, v_7fffffff),
-        _mm_and_si128(
-            nextin,
-            v_7fffffff)); // in this context pack the two __m128 into a single
-    // By ensuring the highest bit is set to 0(&v_7fffffff), we are making sure
-    // all values are interpreted as non-negative, or specifically, the values
-    // are within the range of valid Unicode code points. remember : having
-    // leading byte 0 means a positive number by the two complements system.
-    // Unicode is well beneath the range where you'll start getting issues so
-    // that's OK.
-
-    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
+  while (buf + 16 + safety_margin < end) {
+    __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
+    __m256i nextin = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);
 
-    // Check for ASCII fast path
+    // Check if no bits set above 16th
+    if (__lasx_xbz_v(__lasx_xvpickod_h(in, nextin))) {
+      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+      // Apply UTF-16 => UTF-8 routine (lasx_convert_utf16_to_utf8.cpp)
+      __m256i utf16_packed =
+          __lasx_xvpermi_d(__lasx_xvpickev_h(nextin, in), 0b11011000);
 
-    // ASCII fast path!!!!
-    // We eagerly load another 32 bytes, hoping that they will be ASCII too.
-    // The intuition is that we try to collect 16 ASCII characters which
-    // requires a total of 64 bytes of input. If we fail, we just pass thirdin
-    // and fourthin as our new inputs.
-    if (_mm_testz_si128(in_16, v_ff80)) { // if the first two blocks are ASCII
-      __m128i thirdin = _mm_loadu_si128((__m128i *)buf + 2);
-      __m128i fourthin = _mm_loadu_si128((__m128i *)buf + 3);
-      running_max = _mm_max_epu32(
-          _mm_max_epu32(thirdin, running_max),
-          fourthin); // take the running max of all 4 vectors thus far
-      __m128i nextin_16 = _mm_packus_epi32(
-          _mm_and_si128(thirdin, v_7fffffff),
-          _mm_and_si128(fourthin,
-                        v_7fffffff)); // pack into 1 vector, now you have two
-      if (!_mm_testz_si128(
-              nextin_16,
-              v_ff80)) { // checks if the second packed vector is ASCII, if not:
+      if (__lasx_xbz_v(__lasx_xvslt_hu(__lasx_xvrepli_h(0x7F),
+                                       utf16_packed))) { // ASCII fast path!!!!
         // 1. pack the bytes
         // obviously suboptimal.
-        const __m128i utf8_packed = _mm_packus_epi16(
-            in_16, in_16); // creates two copy of in_16 in 1 vector
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i *)utf8_output,
-                         utf8_packed); // put them into the output
-        // 3. adjust pointers
-        buf += 8; // the char32_t buffer pointer goes up 8 char32_t chars* 32
-                  // bits =  256 bits
-        utf8_output +=
-            8; // same with output, e.g. lift the first two blocks alone.
-        // Proceed with next input
-        in_16 = nextin_16;
-        // We need to update in and nextin because they are used later.
-        in = thirdin;
-        nextin = fourthin;
-      } else {
-        // 1. pack the bytes
-        const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
+        __m256i utf8_packed = __lasx_xvpermi_d(
+            __lasx_xvpickev_b(utf16_packed, utf16_packed), 0b00001000);
+        // 2. store (16 bytes)
+        __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
         // 3. adjust pointers
         buf += 16;
         utf8_output += 16;
         continue; // we are done for this round!
       }
-    }
-
-    // no bits set above 7th bit -- find out all the ASCII characters
-    const __m128i one_byte_bytemask =
-        _mm_cmpeq_epi16( // this takes four bytes at a time and compares:
-            _mm_and_si128(in_16, v_ff80), // the vector that get only the first
-                                          // 9 bits of each 16-bit/2-byte units
-            v_0000                        //
-        ); // they should be all zero if they are ASCII. E.g. ASCII in UTF32 is
-           // of format 0000 0000 0000 0XXX XXXX
-    // _mm_cmpeq_epi16 should now return a 1111 1111 1111 1111 for equals, and
-    // 0000 0000 0000 0000 if not for each 16-bit/2-byte units
-    const uint16_t one_byte_bitmask = static_cast<uint16_t>(_mm_movemask_epi8(
-        one_byte_bytemask)); // collect the MSB from previous vector and put
-                             // them into uint16_t mas
-
-    // no bits set above 11th bit
-    const __m128i one_or_two_bytes_bytemask =
-        _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
-    const uint16_t one_or_two_bytes_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-    if (one_or_two_bytes_bitmask == 0xffff) {
-      // case: all code units either produce 1 or 2 UTF-8 bytes (at least one
-      // produces 2 bytes)
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m128i v_1f00 =
-          _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000
-      const __m128i v_003f =
-          _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two
-      // t1 = [000a|aaaa|0000|0000]
-      const __m128i t1 =
-          _mm_and_si128(t0, v_1f00); // potentital first utf8 byte
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m128i t2 =
-          _mm_and_si128(in_16, v_003f); // potential second utf8 byte
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m128i t3 =
-          _mm_or_si128(t1, t2); // first and second potential utf8 byte together
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m128i t4 = _mm_or_si128(
-          t3,
-          v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m128i utf8_unpacked =
-          _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h -
-      //    MSB, a - LSB)
-      const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
-      const uint16_t m1 =
-          static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
-      const uint8_t m2 =
-          static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
-      // 4. pack the bytes
-      const uint8_t *row =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
-      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-      // 5. store bytes
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
-
-      // 6. adjust pointers
-      buf += 8;
-      utf8_output += row[0];
-      continue;
-    }
-
-    // Check for overflow in packing
-
-    const __m128i saturation_bytemask = _mm_cmpeq_epi32(
-        _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask =
-        static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
-    if (saturation_bitmask == 0xffff) {
-      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
-      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-      forbidden_bytemask =
-          _mm_or_si128(forbidden_bytemask,
-                       _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800));
-
-      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
-        single UFT-8 byte
-          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
-        two UTF-8 bytes
-          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
-        three UTF-8 bytes
-
-        We expand the input word (16-bit) into two code units (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
 
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two code units we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+      if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, utf16_packed))) {
+        // 1. prepare 2-byte values
+        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
+        // expected output   : [110a|aaaa|10bb|bbbb] x 8
 
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m128i s0 = _mm_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
-      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
-                                          simdutf_vec(0b0100000000000000));
-      const __m128i s4 = _mm_xor_si128(s3, m0);
-#undef simdutf_vec
+        // t0 = [000a|aaaa|bbbb|bb00]
+        const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2);
+        // t1 = [000a|aaaa|0000|0000]
+        const __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/));
+        // t2 = [0000|0000|00bb|bbbb]
+        const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f));
+        // t3 = [000a|aaaa|00bb|bbbb]
+        const __m256i t3 = __lasx_xvor_v(t1, t2);
+        // t4 = [110a|aaaa|10bb|bbbb]
+        const __m256i t4 = __lasx_xvor_v(t3, v_c080);
+        // 2. merge ASCII and 2-byte codewords
+        __m256i one_byte_bytemask =
+            __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F /*0x007F*/));
+        __m256i utf8_unpacked =
+            __lasx_xvbitsel_v(t4, utf16_packed, one_byte_bytemask);
+        // 3. prepare bitmask for 8-bit lookup
+        __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
+        uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
+        uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
+        // 4. pack the bytes
+        const uint8_t *row1 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                [lasx_1_2_utf8_bytes_mask[m1]][0];
+        __m128i shuffle1 = __lsx_vld(row1, 1);
+        __m128i utf8_packed1 = __lsx_vshuf_b(
+            zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);
+
+        const uint8_t *row2 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                [lasx_1_2_utf8_bytes_mask[m2]][0];
+        __m128i shuffle2 = __lsx_vld(row2, 1);
+        __m128i utf8_packed2 = __lsx_vshuf_b(
+            zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
+        // 5. store bytes
+        __lsx_vst(utf8_packed1, utf8_output, 0);
+        utf8_output += row1[0];
 
-      // 4. expand code units 16-bit => 32-bit
-      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+        __lsx_vst(utf8_packed2, utf8_output, 0);
+        utf8_output += row2[0];
 
-      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint16_t mask =
-          (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
-      if (mask == 0) {
-        // We only have three-byte code units. Use fast path.
-        const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
-                                              15, 13, -1, -1, -1, -1);
-        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
-        utf8_output += 12;
-        buf += 8;
+        buf += 16;
         continue;
-      }
-      const uint8_t mask0 = uint8_t(mask);
+      } else {
+        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+        forbidden_bytemask = __lasx_xvor_v(
+            __lasx_xvand_v(
+                __lasx_xvsle_h(utf16_packed, v_dfff),  // utf16_packed <= 0xdfff
+                __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+            forbidden_bytemask);
+        /* In this branch we handle three cases:
+            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+           single UTF-8 byte
+            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
+           two UTF-8 bytes
+            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+           three UTF-8 bytes
+
+            We expand the input word (16-bit) into two code units (32-bit), thus
+            we have room for four bytes. However, we need five distinct bit
+            layouts. Note that the last byte in cases #2 and #3 is the same.
+
+            We precompute byte 1 for case #1 and the common byte for cases #2 &
+           #3 in register t2.
+
+            We precompute byte 1 for case #3 and -- **conditionally** --
+           precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+           they differ by exactly one bit.
+
+            Finally from these two code units we build proper UTF-8 sequence,
+           taking into account the case (i.e, the number of bytes to write).
+        */
+        /**
+         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+         * t2 => [0ccc|cccc] [10cc|cccc]
+         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+         */
+        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+        __m256i t0 = __lasx_xvpickev_b(utf16_packed, utf16_packed);
+        t0 = __lasx_xvilvl_b(t0, t0);
+        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+        __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
+        __m256i t1 = __lasx_xvand_v(t0, v_3f7f);
+        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+        __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688 /*0x8000*/));
 
-      const uint8_t *row0 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+        __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12);
+        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+        __m256i s1 = __lasx_xvslli_h(utf16_packed, 2);
+        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+        s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/));
+        // [00bb|bbbb|0000|aaaa]
+        __m256i s2 = __lasx_xvor_v(s0, s1);
+        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+        __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
+        __m256i s3 = __lasx_xvor_v(s2, v_c0e0);
+        // v_07ff (0x07FF in every 16-bit lane) is set up before the main loop.
+        __m256i one_or_two_bytes_bytemask =
+            __lasx_xvsle_hu(utf16_packed, v_07ff);
+        __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask,
+                                     __lasx_xvldi(-2752 /*0x4000*/));
+        __m256i s4 = __lasx_xvxor_v(s3, m0);
 
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+        // 4. expand code units 16-bit => 32-bit
+        __m256i out0 = __lasx_xvilvl_h(s4, t2);
+        __m256i out1 = __lasx_xvilvh_h(s4, t2);
 
-      const uint8_t *row1 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+        __m256i one_byte_bytemask =
+            __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F));
+
+        __m256i one_or_two_bytes_bytemask_u16_to_u32_low =
+            __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
+        __m256i one_or_two_bytes_bytemask_u16_to_u32_high =
+            __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);
+
+        __m256i one_byte_bytemask_u16_to_u32_low =
+            __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
+        __m256i one_byte_bytemask_u16_to_u32_high =
+            __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+        __m256i mask0 = __lasx_xvmskltz_h(
+            __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_low,
+                          one_byte_bytemask_u16_to_u32_low));
+        __m256i mask1 = __lasx_xvmskltz_h(
+            __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_high,
+                          one_byte_bytemask_u16_to_u32_high));
+
+        uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
+        const uint8_t *row0 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                  [0];
+        __m128i shuffle0 = __lsx_vld(row0, 1);
+        __m128i utf8_0 =
+            __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
+        __lsx_vst(utf8_0, utf8_output, 0);
+        utf8_output += row0[0];
 
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
-      utf8_output += row1[0];
+        mask = __lasx_xvpickve2gr_wu(mask1, 0);
+        const uint8_t *row1 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                  [0];
+        __m128i shuffle1 = __lsx_vld(row1, 1);
+        __m128i utf8_1 =
+            __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
+        __lsx_vst(utf8_1, utf8_output, 0);
+        utf8_output += row1[0];
 
-      buf += 8;
+        mask = __lasx_xvpickve2gr_wu(mask0, 4);
+        const uint8_t *row2 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                  [0];
+        __m128i shuffle2 = __lsx_vld(row2, 1);
+        __m128i utf8_2 =
+            __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
+        __lsx_vst(utf8_2, utf8_output, 0);
+        utf8_output += row2[0];
+
+        mask = __lasx_xvpickve2gr_wu(mask1, 4);
+        const uint8_t *row3 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                  [0];
+        __m128i shuffle3 = __lsx_vld(row3, 1);
+        __m128i utf8_3 =
+            __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
+        __lsx_vst(utf8_3, utf8_output, 0);
+        utf8_output += row3[0];
+
+        buf += 16;
+      }
+      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
+      // will produce four UTF-8 bytes.
     } else {
-      // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=>
-      // will produce four UTF-8 bytes Let us do a scalar fallback. It may seem
-      // wasteful to use scalar code, but being efficient with SIMD in the
-      // presence of surrogate pairs may require non-trivial tables.
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
       size_t forward = 15;
       size_t k = 0;
       if (size_t(end - buf) < forward + 1) {
@@ -37313,14 +52280,16 @@ sse_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
           *utf8_output++ = char((word & 0b111111) | 0b10000000);
         } else if ((word & 0xFFFF0000) == 0) {
           if (word >= 0xD800 && word <= 0xDFFF) {
-            return std::make_pair(nullptr, utf8_output);
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char *>(utf8_output));
           }
           *utf8_output++ = char((word >> 12) | 0b11100000);
           *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
           *utf8_output++ = char((word & 0b111111) | 0b10000000);
         } else {
           if (word > 0x10FFFF) {
-            return std::make_pair(nullptr, utf8_output);
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char *>(utf8_output));
           }
           *utf8_output++ = char((word >> 18) | 0b11110000);
           *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
@@ -37333,242 +52302,269 @@ sse_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) {
   } // while
 
   // check for invalid input
-  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
-  if (static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(
-          _mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) {
-    return std::make_pair(nullptr, utf8_output);
-  }
-
-  if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
-    return std::make_pair(nullptr, utf8_output);
+  if (__lasx_xbnz_v(forbidden_bytemask)) {
+    return std::make_pair(nullptr, reinterpret_cast<char *>(utf8_output));
   }
-
-  return std::make_pair(buf, utf8_output);
+  return std::make_pair(buf, reinterpret_cast<char *>(utf8_output));
 }
 
 std::pair<result, char *>
-sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
-                                      char *utf8_output) {
-  const char32_t *end = buf + len;
+lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
+                                       char *utf8_out) {
+  uint8_t *utf8_output = reinterpret_cast<uint8_t *>(utf8_out);
   const char32_t *start = buf;
+  const char32_t *end = buf + len;
 
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-  const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080);
-  const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80);
-  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
-  const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff);
-  const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff);
+  // Scalar prologue: consume code points until buf is 32-byte aligned
+  while (((uint64_t)buf & 0x1F) && buf < end) {
+    uint32_t word = *buf;
+    if ((word & 0xFFFFFF80) == 0) {
+      *utf8_output++ = char(word);
+    } else if ((word & 0xFFFFF800) == 0) {
+      *utf8_output++ = char((word >> 6) | 0b11000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+    } else if ((word & 0xFFFF0000) == 0) {
+      if (word >= 0xD800 && word <= 0xDFFF) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start),
+                              reinterpret_cast<char *>(utf8_output));
+      }
+      *utf8_output++ = char((word >> 12) | 0b11100000);
+      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+    } else {
+      if (word > 0x10FFFF) {
+        return std::make_pair(result(error_code::TOO_LARGE, buf - start),
+                              reinterpret_cast<char *>(utf8_output));
+      }
+      *utf8_output++ = char((word >> 18) | 0b11110000);
+      *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
+      *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+      *utf8_output++ = char((word & 0b111111) | 0b10000000);
+    }
+    buf++;
+  }
 
+  __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xC080));
+  __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7FF));
+  __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xDFFF));
+  __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/
+  __m256i zero = __lasx_xvldi(0);
+  __m128i zero_128 = __lsx_vldi(0);
+  __m256i forbidden_bytemask = __lasx_xvldi(0x0);
   const size_t safety_margin =
       12; // to avoid overruns, see issue
           // https://github.com/simdutf/simdutf/issues/92
 
-  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
-    // We load two 16 bytes registers for a total of 32 bytes or 8 characters.
-    __m128i in = _mm_loadu_si128((__m128i *)buf);
-    __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
-    // Check for too large input
-    __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
-    if (static_cast<uint16_t>(_mm_movemask_epi8(
-            _mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
-      return std::make_pair(result(error_code::TOO_LARGE, buf - start),
-                            utf8_output);
-    }
-
-    // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned
-    // saturation
-    __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff),
-                                     _mm_and_si128(nextin, v_7fffffff));
-
-    // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp
-
-    // Check for ASCII fast path
-    if (_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
-      // 1. pack the bytes
-      // obviously suboptimal.
-      const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16);
-      // 2. store (16 bytes)
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
-      // 3. adjust pointers
-      buf += 8;
-      utf8_output += 8;
-      continue;
-    }
-
-    // no bits set above 7th bit
-    const __m128i one_byte_bytemask =
-        _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000);
-    const uint16_t one_byte_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(one_byte_bytemask));
-
-    // no bits set above 11th bit
-    const __m128i one_or_two_bytes_bytemask =
-        _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000);
-    const uint16_t one_or_two_bytes_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(one_or_two_bytes_bytemask));
-
-    if (one_or_two_bytes_bitmask == 0xffff) {
-      // case: all code units either produce 1 or 2 UTF-8 bytes (at least one
-      // produces 2 bytes)
-      // 1. prepare 2-byte values
-      // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8
-      // expected output   : [110a|aaaa|10bb|bbbb] x 8
-      const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00);
-      const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f);
-
-      // t0 = [000a|aaaa|bbbb|bb00]
-      const __m128i t0 = _mm_slli_epi16(in_16, 2);
-      // t1 = [000a|aaaa|0000|0000]
-      const __m128i t1 = _mm_and_si128(t0, v_1f00);
-      // t2 = [0000|0000|00bb|bbbb]
-      const __m128i t2 = _mm_and_si128(in_16, v_003f);
-      // t3 = [000a|aaaa|00bb|bbbb]
-      const __m128i t3 = _mm_or_si128(t1, t2);
-      // t4 = [110a|aaaa|10bb|bbbb]
-      const __m128i t4 = _mm_or_si128(t3, v_c080);
-
-      // 2. merge ASCII and 2-byte codewords
-      const __m128i utf8_unpacked =
-          _mm_blendv_epi8(t4, in_16, one_byte_bytemask);
-
-      // 3. prepare bitmask for 8-bit lookup
-      //    one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h -
-      //    MSB, a - LSB)
-      const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a
-      const uint16_t m1 =
-          static_cast<uint16_t>(m0 >> 7); // m1 = 00000000h0g0f0e0
-      const uint8_t m2 =
-          static_cast<uint8_t>((m0 | m1) & 0xff); // m2 =         hdgcfbea
-      // 4. pack the bytes
-      const uint8_t *row =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0];
-      const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1));
-      const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle);
-
-      // 5. store bytes
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_packed);
-
-      // 6. adjust pointers
-      buf += 8;
-      utf8_output += row[0];
-      continue;
-    }
-
-    // Check for overflow in packing
-    const __m128i saturation_bytemask = _mm_cmpeq_epi32(
-        _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask =
-        static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+  while (buf + 16 + safety_margin < end) {
+    __m256i in = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
+    __m256i nextin = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);
 
-    if (saturation_bitmask == 0xffff) {
-      // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+    // Check if no bits set above 16th
+    if (__lasx_xbz_v(__lasx_xvpickod_h(in, nextin))) {
+      // Pack UTF-32 to UTF-16 safely (without surrogate pairs)
+      // Apply UTF-16 => UTF-8 routine (lasx_convert_utf16_to_utf8.cpp)
+      __m256i utf16_packed =
+          __lasx_xvpermi_d(__lasx_xvpickev_h(nextin, in), 0b11011000);
 
-      // Check for illegal surrogate code units
-      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-      const __m128i forbidden_bytemask =
-          _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
-        return std::make_pair(result(error_code::SURROGATE, buf - start),
-                              utf8_output);
+      if (__lasx_xbz_v(__lasx_xvslt_hu(__lasx_xvrepli_h(0x7F),
+                                       utf16_packed))) { // ASCII fast path!!!!
+        // 1. pack the bytes
+        // obviously suboptimal.
+        __m256i utf8_packed = __lasx_xvpermi_d(
+            __lasx_xvpickev_b(utf16_packed, utf16_packed), 0b00001000);
+        // 2. store (16 bytes)
+        __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0);
+        // 3. adjust pointers
+        buf += 16;
+        utf8_output += 16;
+        continue; // we are done for this round!
       }
 
-      const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606,
-                                              0x0808, 0x0a0a, 0x0c0c, 0x0e0e);
-
-      /* In this branch we handle three cases:
-          1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
-        single UFT-8 byte
-          2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
-        two UTF-8 bytes
-          3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
-        three UTF-8 bytes
-
-        We expand the input word (16-bit) into two code units (32-bit), thus
-        we have room for four bytes. However, we need five distinct bit
-        layouts. Note that the last byte in cases #2 and #3 is the same.
-
-        We precompute byte 1 for case #1 and the common byte for cases #2 & #3
-        in register t2.
-
-        We precompute byte 1 for case #3 and -- **conditionally** -- precompute
-        either byte 1 for case #2 or byte 2 for case #3. Note that they
-        differ by exactly one bit.
-
-        Finally from these two code units we build proper UTF-8 sequence, taking
-        into account the case (i.e, the number of bytes to write).
-      */
-      /**
-       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
-       * t2 => [0ccc|cccc] [10cc|cccc]
-       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
-       */
-#define simdutf_vec(x) _mm_set1_epi16(static_cast<uint16_t>(x))
-      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
-      const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even);
-      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
-      const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111));
-      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
-      const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000));
+      if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, utf16_packed))) {
+        // 1. prepare 2-byte values
+        // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16
+        // expected output   : [110a|aaaa|10bb|bbbb] x 16
 
-      // [aaaa|bbbb|bbcc|cccc] =>  [0000|aaaa|bbbb|bbcc]
-      const __m128i s0 = _mm_srli_epi16(in_16, 4);
-      // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00]
-      const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100));
-      // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa]
-      const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140));
-      // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
-      const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000));
-      const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask,
-                                          simdutf_vec(0b0100000000000000));
-      const __m128i s4 = _mm_xor_si128(s3, m0);
-#undef simdutf_vec
+        // t0 = [000a|aaaa|bbbb|bb00]
+        const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2);
+        // t1 = [000a|aaaa|0000|0000]
+        const __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/));
+        // t2 = [0000|0000|00bb|bbbb]
+        const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f));
+        // t3 = [000a|aaaa|00bb|bbbb]
+        const __m256i t3 = __lasx_xvor_v(t1, t2);
+        // t4 = [110a|aaaa|10bb|bbbb]
+        const __m256i t4 = __lasx_xvor_v(t3, v_c080);
+        // 2. merge ASCII and 2-byte codewords
+        __m256i one_byte_bytemask =
+            __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F /*0x007F*/));
+        __m256i utf8_unpacked =
+            __lasx_xvbitsel_v(t4, utf16_packed, one_byte_bytemask);
+        // 3. prepare bitmask for 8-bit lookup
+        __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask);
+        uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0);
+        uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4);
+        // 4. pack the bytes
+        const uint8_t *row1 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                [lasx_1_2_utf8_bytes_mask[m1]][0];
+        __m128i shuffle1 = __lsx_vld(row1, 1);
+        __m128i utf8_packed1 = __lsx_vshuf_b(
+            zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1);
+
+        const uint8_t *row2 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes
+                [lasx_1_2_utf8_bytes_mask[m2]][0];
+        __m128i shuffle2 = __lsx_vld(row2, 1);
+        __m128i utf8_packed2 = __lsx_vshuf_b(
+            zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2);
+        // 5. store bytes
+        __lsx_vst(utf8_packed1, utf8_output, 0);
+        utf8_output += row1[0];
 
-      // 4. expand code units 16-bit => 32-bit
-      const __m128i out0 = _mm_unpacklo_epi16(t2, s4);
-      const __m128i out1 = _mm_unpackhi_epi16(t2, s4);
+        __lsx_vst(utf8_packed2, utf8_output, 0);
+        utf8_output += row2[0];
 
-      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
-      const uint16_t mask =
-          (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa);
-      if (mask == 0) {
-        // We only have three-byte code units. Use fast path.
-        const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14,
-                                              15, 13, -1, -1, -1, -1);
-        const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle);
-        const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle);
-        _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
-        utf8_output += 12;
-        _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
-        utf8_output += 12;
-        buf += 8;
+        buf += 16;
         continue;
-      }
-      const uint8_t mask0 = uint8_t(mask);
+      } else {
+        // case: code units from register produce either 1, 2 or 3 UTF-8 bytes
+        forbidden_bytemask = __lasx_xvor_v(
+            __lasx_xvand_v(
+                __lasx_xvsle_h(utf16_packed, v_dfff),  // utf16_packed <= 0xdfff
+                __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+            forbidden_bytemask);
+        if (__lasx_xbnz_v(forbidden_bytemask)) {
+          return std::make_pair(result(error_code::SURROGATE, buf - start),
+                                reinterpret_cast<char *>(utf8_output));
+        }
+        /* In this branch we handle three cases:
+            1. [0000|0000|0ccc|cccc] => [0ccc|cccc]                           -
+           single UTF-8 byte
+            2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc]              -
+           two UTF-8 bytes
+            3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] -
+           three UTF-8 bytes
+
+            We expand the input word (16-bit) into two code units (32-bit), thus
+            we have room for four bytes. However, we need five distinct bit
+            layouts. Note that the last byte in cases #2 and #3 is the same.
+
+            We precompute byte 1 for case #1 and the common byte for cases #2 &
+           #3 in register t2.
+
+            We precompute byte 1 for case #3 and -- **conditionally** --
+           precompute either byte 1 for case #2 or byte 2 for case #3. Note that
+           they differ by exactly one bit.
+
+            Finally from these two code units we build proper UTF-8 sequence,
+           taking into account the case (i.e, the number of bytes to write).
+        */
+        /**
+         * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+         * t2 => [0ccc|cccc] [10cc|cccc]
+         * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+         */
+        // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+        __m256i t0 = __lasx_xvpickev_b(utf16_packed, utf16_packed);
+        t0 = __lasx_xvilvl_b(t0, t0);
+        // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+        __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F));
+        __m256i t1 = __lasx_xvand_v(t0, v_3f7f);
+        // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+        __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688 /*0x8000*/));
 
-      const uint8_t *row0 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0];
-      const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1));
-      const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0);
+        // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+        __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12);
+        // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+        __m256i s1 = __lasx_xvslli_h(utf16_packed, 2);
+        // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+        s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/));
+        // [00bb|bbbb|0000|aaaa]
+        __m256i s2 = __lasx_xvor_v(s0, s1);
+        // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+        __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0));
+        __m256i s3 = __lasx_xvor_v(s2, v_c0e0);
+        // v_07ff (0x07FF in each 16-bit lane) is set up before the main loop.
+        __m256i one_or_two_bytes_bytemask =
+            __lasx_xvsle_hu(utf16_packed, v_07ff);
+        __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask,
+                                     __lasx_xvldi(-2752 /*0x4000*/));
+        __m256i s4 = __lasx_xvxor_v(s3, m0);
 
-      const uint8_t mask1 = static_cast<uint8_t>(mask >> 8);
+        // 4. expand code units 16-bit => 32-bit
+        __m256i out0 = __lasx_xvilvl_h(s4, t2);
+        __m256i out1 = __lasx_xvilvh_h(s4, t2);
 
-      const uint8_t *row1 =
-          &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0];
-      const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1));
-      const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1);
+        // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+        __m256i one_byte_bytemask =
+            __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F));
+
+        __m256i one_or_two_bytes_bytemask_u16_to_u32_low =
+            __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero);
+        __m256i one_or_two_bytes_bytemask_u16_to_u32_high =
+            __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero);
+
+        __m256i one_byte_bytemask_u16_to_u32_low =
+            __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask);
+        __m256i one_byte_bytemask_u16_to_u32_high =
+            __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask);
+
+        __m256i mask0 = __lasx_xvmskltz_h(
+            __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_low,
+                          one_byte_bytemask_u16_to_u32_low));
+        __m256i mask1 = __lasx_xvmskltz_h(
+            __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_high,
+                          one_byte_bytemask_u16_to_u32_high));
+
+        uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0);
+        const uint8_t *row0 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                  [0];
+        __m128i shuffle0 = __lsx_vld(row0, 1);
+        __m128i utf8_0 =
+            __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0);
+        __lsx_vst(utf8_0, utf8_output, 0);
+        utf8_output += row0[0];
 
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_0);
-      utf8_output += row0[0];
-      _mm_storeu_si128((__m128i *)utf8_output, utf8_1);
-      utf8_output += row1[0];
+        mask = __lasx_xvpickve2gr_wu(mask1, 0);
+        const uint8_t *row1 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                  [0];
+        __m128i shuffle1 = __lsx_vld(row1, 1);
+        __m128i utf8_1 =
+            __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1);
+        __lsx_vst(utf8_1, utf8_output, 0);
+        utf8_output += row1[0];
 
-      buf += 8;
+        mask = __lasx_xvpickve2gr_wu(mask0, 4);
+        const uint8_t *row2 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                  [0];
+        __m128i shuffle2 = __lsx_vld(row2, 1);
+        __m128i utf8_2 =
+            __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2);
+        __lsx_vst(utf8_2, utf8_output, 0);
+        utf8_output += row2[0];
+
+        mask = __lasx_xvpickve2gr_wu(mask1, 4);
+        const uint8_t *row3 =
+            &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF]
+                                                                  [0];
+        __m128i shuffle3 = __lsx_vld(row3, 1);
+        __m128i utf8_3 =
+            __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3);
+        __lsx_vst(utf8_3, utf8_output, 0);
+        utf8_output += row3[0];
+
+        buf += 16;
+      }
+      // At least one 32-bit word will produce a surrogate pair in UTF-16 <=>
+      // will produce four UTF-8 bytes.
     } else {
-      // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=>
-      // will produce four UTF-8 bytes Let us do a scalar fallback. It may seem
-      // wasteful to use scalar code, but being efficient with SIMD in the
-      // presence of surrogate pairs may require non-trivial tables.
+      // Let us do a scalar fallback.
+      // It may seem wasteful to use scalar code, but being efficient with SIMD
+      // in the presence of surrogate pairs may require non-trivial tables.
       size_t forward = 15;
       size_t k = 0;
       if (size_t(end - buf) < forward + 1) {
@@ -37584,7 +52580,8 @@ sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
         } else if ((word & 0xFFFF0000) == 0) {
           if (word >= 0xD800 && word <= 0xDFFF) {
             return std::make_pair(
-                result(error_code::SURROGATE, buf - start + k), utf8_output);
+                result(error_code::SURROGATE, buf - start + k),
+                reinterpret_cast<char *>(utf8_output));
           }
           *utf8_output++ = char((word >> 12) | 0b11100000);
           *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
@@ -37592,7 +52589,8 @@ sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
         } else {
           if (word > 0x10FFFF) {
             return std::make_pair(
-                result(error_code::TOO_LARGE, buf - start + k), utf8_output);
+                result(error_code::TOO_LARGE, buf - start + k),
+                reinterpret_cast<char *>(utf8_output));
           }
           *utf8_output++ = char((word >> 18) | 0b11110000);
           *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000);
@@ -37603,51 +52601,76 @@ sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len,
       buf += k;
     }
   } // while
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
+
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        reinterpret_cast<char *>(utf8_output));
 }
-/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */
-/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */
+/* end file src/lasx/lasx_convert_utf32_to_utf8.cpp */
+/* begin file src/lasx/lasx_convert_utf32_to_utf16.cpp */
 template <endianness big_endian>
 std::pair<const char32_t *, char16_t *>
-sse_convert_utf32_to_utf16(const char32_t *buf, size_t len,
-                           char16_t *utf16_output) {
-
+lasx_convert_utf32_to_utf16(const char32_t *buf, size_t len,
+                            char16_t *utf16_out) {
+  uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
   const char32_t *end = buf + len;
 
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
-  __m128i forbidden_bytemask = _mm_setzero_si128();
+  // Go scalar until utf16_output is 32-byte aligned (unaligned is slower)
+  while (((uint64_t)utf16_output & 0x1F) && buf < end) {
+    uint32_t word = *buf++;
+    if ((word & 0xFFFF0000) == 0) {
+      // will not generate a surrogate pair
+      if (word >= 0xD800 && word <= 0xDFFF) {
+        return std::make_pair(nullptr,
+                              reinterpret_cast<char16_t *>(utf16_output));
+      }
+      *utf16_output++ = !match_system(big_endian)
+                            ? char16_t(word >> 8 | word << 8)
+                            : char16_t(word);
+      // (buf was already advanced by the post-increment when word was read)
+    } else {
+      // will generate a surrogate pair
+      if (word > 0x10FFFF) {
+        return std::make_pair(nullptr,
+                              reinterpret_cast<char16_t *>(utf16_output));
+      }
+      word -= 0x10000;
+      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+      if (!match_system(big_endian)) {
+        high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+        low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+      // (buf was already advanced by the post-increment when word was read)
+    }
+  }
 
-  while (end - buf >= 8) {
-    __m128i in = _mm_loadu_si128((__m128i *)buf);
-    __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
-    const __m128i saturation_bytemask = _mm_cmpeq_epi32(
-        _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask =
-        static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+  __m256i forbidden_bytemask = __lasx_xvrepli_h(0);
+  __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/
+  __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xdfff));
+  while (buf + 16 <= end) {
+    __m256i in0 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
+    __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);
 
     // Check if no bits set above 16th
-    if (saturation_bitmask == 0xffff) {
-      // Pack UTF-32 to UTF-16
-      __m128i utf16_packed = _mm_packus_epi32(in, nextin);
-
-      const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-      forbidden_bytemask = _mm_or_si128(
-          forbidden_bytemask,
-          _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800));
+    if (__lasx_xbz_v(__lasx_xvpickod_h(in1, in0))) {
+      __m256i utf16_packed =
+          __lasx_xvpermi_d(__lasx_xvpickev_h(in1, in0), 0b11011000);
+      forbidden_bytemask = __lasx_xvor_v(
+          __lasx_xvand_v(
+              __lasx_xvsle_h(utf16_packed, v_dfff),  // utf16_packed <= 0xdfff
+              __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+          forbidden_bytemask);
 
-      if (big_endian) {
-        const __m128i swap =
-            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+      if (!match_system(big_endian)) {
+        utf16_packed = lasx_swap_bytes(utf16_packed);
       }
-
-      _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
+      __lasx_xvst(utf16_packed, utf16_output, 0);
+      utf16_output += 16;
+      buf += 16;
     } else {
-      size_t forward = 7;
+      size_t forward = 15;
       size_t k = 0;
       if (size_t(end - buf) < forward + 1) {
         forward = size_t(end - buf - 1);
@@ -37657,25 +52680,25 @@ sse_convert_utf32_to_utf16(const char32_t *buf, size_t len,
         if ((word & 0xFFFF0000) == 0) {
           // will not generate a surrogate pair
           if (word >= 0xD800 && word <= 0xDFFF) {
-            return std::make_pair(nullptr, utf16_output);
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char16_t *>(utf16_output));
           }
-          *utf16_output++ =
-              big_endian
-                  ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
-                  : char16_t(word);
+          *utf16_output++ = !match_system(big_endian)
+                                ? char16_t(word >> 8 | word << 8)
+                                : char16_t(word);
         } else {
           // will generate a surrogate pair
           if (word > 0x10FFFF) {
-            return std::make_pair(nullptr, utf16_output);
+            return std::make_pair(nullptr,
+                                  reinterpret_cast<char16_t *>(utf16_output));
           }
           word -= 0x10000;
           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
+          if (!match_system(big_endian)) {
             high_surrogate =
-                uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate =
-                uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
           }
           *utf16_output++ = char16_t(high_surrogate);
           *utf16_output++ = char16_t(low_surrogate);
@@ -37686,56 +52709,80 @@ sse_convert_utf32_to_utf16(const char32_t *buf, size_t len,
   }
 
   // check for invalid input
-  if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
-    return std::make_pair(nullptr, utf16_output);
+  if (__lasx_xbnz_v(forbidden_bytemask)) {
+    return std::make_pair(nullptr, reinterpret_cast<char16_t *>(utf16_output));
   }
-
-  return std::make_pair(buf, utf16_output);
+  return std::make_pair(buf, reinterpret_cast<char16_t *>(utf16_output));
 }
 
 template <endianness big_endian>
 std::pair<result, char16_t *>
-sse_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
-                                       char16_t *utf16_output) {
+lasx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
+                                        char16_t *utf16_out) {
+  uint16_t *utf16_output = reinterpret_cast<uint16_t *>(utf16_out);
   const char32_t *start = buf;
   const char32_t *end = buf + len;
 
-  const __m128i v_0000 = _mm_setzero_si128();
-  const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
+  // Go scalar until utf16_output is 32-byte aligned: unaligned access is slow
+  while (((uint64_t)utf16_output & 0x1F) && buf < end) {
+    uint32_t word = *buf++;
+    if ((word & 0xFFFF0000) == 0) {
+      // will not generate a surrogate pair
+      if (word >= 0xD800 && word <= 0xDFFF) {
+        return std::make_pair(result(error_code::SURROGATE, buf - start - 1),
+                              reinterpret_cast<char16_t *>(utf16_output));
+      }
+      *utf16_output++ = !match_system(big_endian)
+                            ? char16_t(word >> 8 | word << 8)
+                            : char16_t(word);
+    } else {
+      // will generate a surrogate pair
+      if (word > 0x10FFFF) {
+        return std::make_pair(result(error_code::TOO_LARGE, buf - start - 1),
+                              reinterpret_cast<char16_t *>(utf16_output));
+      }
+      word -= 0x10000;
+      uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
+      uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
+      if (!match_system(big_endian)) {
+        high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+        low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
+      }
+      *utf16_output++ = char16_t(high_surrogate);
+      *utf16_output++ = char16_t(low_surrogate);
+    }
+  }
 
-  while (end - buf >= 8) {
-    __m128i in = _mm_loadu_si128((__m128i *)buf);
-    __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1);
-    const __m128i saturation_bytemask = _mm_cmpeq_epi32(
-        _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
-    const uint32_t saturation_bitmask =
-        static_cast<uint32_t>(_mm_movemask_epi8(saturation_bytemask));
+  __m256i forbidden_bytemask = __lasx_xvrepli_h(0);
+  __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/
+  __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xdfff));
+  while (buf + 16 <= end) {
+    __m256i in0 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 0);
+    __m256i in1 = __lasx_xvld(reinterpret_cast<const uint32_t *>(buf), 32);
 
     // Check if no bits set above 16th
-    if (saturation_bitmask == 0xffff) {
-      // Pack UTF-32 to UTF-16
-      __m128i utf16_packed = _mm_packus_epi32(in, nextin);
-
-      const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800);
-      const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800);
-      const __m128i forbidden_bytemask =
-          _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800);
-      if (static_cast<uint32_t>(_mm_movemask_epi8(forbidden_bytemask)) != 0) {
+    if (__lasx_xbz_v(__lasx_xvpickod_h(in1, in0))) {
+      __m256i utf16_packed =
+          __lasx_xvpermi_d(__lasx_xvpickev_h(in1, in0), 0b11011000);
+      forbidden_bytemask = __lasx_xvor_v(
+          __lasx_xvand_v(
+              __lasx_xvsle_h(utf16_packed, v_dfff),  // utf16_packed <= 0xdfff
+              __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800
+          forbidden_bytemask);
+      if (__lasx_xbnz_v(forbidden_bytemask)) {
         return std::make_pair(result(error_code::SURROGATE, buf - start),
-                              utf16_output);
+                              reinterpret_cast<char16_t *>(utf16_output));
       }
 
-      if (big_endian) {
-        const __m128i swap =
-            _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
-        utf16_packed = _mm_shuffle_epi8(utf16_packed, swap);
+      if (!match_system(big_endian)) {
+        utf16_packed = lasx_swap_bytes(utf16_packed);
       }
 
-      _mm_storeu_si128((__m128i *)utf16_output, utf16_packed);
-      utf16_output += 8;
-      buf += 8;
+      __lasx_xvst(utf16_packed, utf16_output, 0);
+      utf16_output += 16;
+      buf += 16;
     } else {
-      size_t forward = 7;
+      size_t forward = 15;
       size_t k = 0;
       if (size_t(end - buf) < forward + 1) {
         forward = size_t(end - buf - 1);
@@ -37746,26 +52793,26 @@ sse_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
           // will not generate a surrogate pair
           if (word >= 0xD800 && word <= 0xDFFF) {
             return std::make_pair(
-                result(error_code::SURROGATE, buf - start + k), utf16_output);
+                result(error_code::SURROGATE, buf - start + k),
+                reinterpret_cast<char16_t *>(utf16_output));
           }
-          *utf16_output++ =
-              big_endian
-                  ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8))
-                  : char16_t(word);
+          *utf16_output++ = !match_system(big_endian)
+                                ? char16_t(word >> 8 | word << 8)
+                                : char16_t(word);
         } else {
           // will generate a surrogate pair
           if (word > 0x10FFFF) {
             return std::make_pair(
-                result(error_code::TOO_LARGE, buf - start + k), utf16_output);
+                result(error_code::TOO_LARGE, buf - start + k),
+                reinterpret_cast<char16_t *>(utf16_output));
           }
           word -= 0x10000;
           uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10));
           uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF));
-          if (big_endian) {
+          if (!match_system(big_endian)) {
             high_surrogate =
-                uint16_t((high_surrogate >> 8) | (high_surrogate << 8));
-            low_surrogate =
-                uint16_t((low_surrogate >> 8) | (low_surrogate << 8));
+                uint16_t(high_surrogate >> 8 | high_surrogate << 8);
+            low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8);
           }
           *utf16_output++ = char16_t(high_surrogate);
           *utf16_output++ = char16_t(low_surrogate);
@@ -37775,10 +52822,11 @@ sse_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
     }
   }
 
-  return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output);
+  return std::make_pair(result(error_code::SUCCESS, buf - start),
+                        reinterpret_cast<char16_t *>(utf16_output));
 }
-/* end file src/westmere/sse_convert_utf32_to_utf16.cpp */
-/* begin file src/westmere/sse_base64.cpp */
+/* end file src/lasx/lasx_convert_utf32_to_utf16.cpp */
+/* begin file src/lasx/lasx_base64.cpp */
 /**
  * References and further reading:
  *
@@ -37806,36 +52854,6 @@ sse_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len,
  * Nick Kopp. 2013. Base64 Encoding on a GPU.
  * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013).
  */
-template <bool base64_url> __m128i lookup_pshufb_improved(const __m128i input) {
-  // credit: Wojciech Muła
-  // reduce  0..51 -> 0
-  //        52..61 -> 1 .. 10
-  //            62 -> 11
-  //            63 -> 12
-  __m128i result = _mm_subs_epu8(input, _mm_set1_epi8(51));
-
-  // distinguish between ranges 0..25 and 26..51:
-  //         0 .. 25 -> remains 0
-  //        26 .. 51 -> becomes 13
-  const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input);
-  result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13)));
-
-  __m128i shift_LUT;
-  if (base64_url) {
-    shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-                              '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-                              '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0);
-  } else {
-    shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-                              '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52,
-                              '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0);
-  }
-
-  // read shift
-  result = _mm_shuffle_epi8(shift_LUT, result);
-
-  return _mm_add_epi8(result, input);
-}
 
 template <bool isbase64url>
 size_t encode_base64(char *dst, const char *src, size_t srclen,
@@ -37843,71 +52861,124 @@ size_t encode_base64(char *dst, const char *src, size_t srclen,
   // credit: Wojciech Muła
   // SSE (lookup: pshufb improved unrolled)
   const uint8_t *input = (const uint8_t *)src;
-
+  static const char *lookup_tbl =
+      isbase64url
+          ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+          : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
   uint8_t *out = (uint8_t *)dst;
-  const __m128i shuf =
-      _mm_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
 
+  v32u8 shuf;
+  __m256i v_fc0fc00, v_3f03f0, shift_r, shift_l, base64_tbl0, base64_tbl1,
+      base64_tbl2, base64_tbl3;
+  if (srclen >= 28) {
+    shuf = v32u8{1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10,
+                 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10};
+
+    v_fc0fc00 = __lasx_xvreplgr2vr_w(uint32_t(0x0fc0fc00));
+    v_3f03f0 = __lasx_xvreplgr2vr_w(uint32_t(0x003f03f0));
+    shift_r = __lasx_xvreplgr2vr_w(uint32_t(0x0006000a));
+    shift_l = __lasx_xvreplgr2vr_w(uint32_t(0x00080004));
+    base64_tbl0 = ____m256i(__lsx_vld(lookup_tbl, 0));
+    base64_tbl1 = ____m256i(__lsx_vld(lookup_tbl, 16));
+    base64_tbl2 = ____m256i(__lsx_vld(lookup_tbl, 32));
+    base64_tbl3 = ____m256i(__lsx_vld(lookup_tbl, 48));
+  }
   size_t i = 0;
-  for (; i + 52 <= srclen; i += 48) {
-    __m128i in0 = _mm_loadu_si128(
-        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 0));
-    __m128i in1 = _mm_loadu_si128(
-        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 1));
-    __m128i in2 = _mm_loadu_si128(
-        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 2));
-    __m128i in3 = _mm_loadu_si128(
-        reinterpret_cast<const __m128i *>(input + i + 4 * 3 * 3));
-
-    in0 = _mm_shuffle_epi8(in0, shuf);
-    in1 = _mm_shuffle_epi8(in1, shuf);
-    in2 = _mm_shuffle_epi8(in2, shuf);
-    in3 = _mm_shuffle_epi8(in3, shuf);
-
-    const __m128i t0_0 = _mm_and_si128(in0, _mm_set1_epi32(0x0fc0fc00));
-    const __m128i t0_1 = _mm_and_si128(in1, _mm_set1_epi32(0x0fc0fc00));
-    const __m128i t0_2 = _mm_and_si128(in2, _mm_set1_epi32(0x0fc0fc00));
-    const __m128i t0_3 = _mm_and_si128(in3, _mm_set1_epi32(0x0fc0fc00));
-
-    const __m128i t1_0 = _mm_mulhi_epu16(t0_0, _mm_set1_epi32(0x04000040));
-    const __m128i t1_1 = _mm_mulhi_epu16(t0_1, _mm_set1_epi32(0x04000040));
-    const __m128i t1_2 = _mm_mulhi_epu16(t0_2, _mm_set1_epi32(0x04000040));
-    const __m128i t1_3 = _mm_mulhi_epu16(t0_3, _mm_set1_epi32(0x04000040));
-
-    const __m128i t2_0 = _mm_and_si128(in0, _mm_set1_epi32(0x003f03f0));
-    const __m128i t2_1 = _mm_and_si128(in1, _mm_set1_epi32(0x003f03f0));
-    const __m128i t2_2 = _mm_and_si128(in2, _mm_set1_epi32(0x003f03f0));
-    const __m128i t2_3 = _mm_and_si128(in3, _mm_set1_epi32(0x003f03f0));
-
-    const __m128i t3_0 = _mm_mullo_epi16(t2_0, _mm_set1_epi32(0x01000010));
-    const __m128i t3_1 = _mm_mullo_epi16(t2_1, _mm_set1_epi32(0x01000010));
-    const __m128i t3_2 = _mm_mullo_epi16(t2_2, _mm_set1_epi32(0x01000010));
-    const __m128i t3_3 = _mm_mullo_epi16(t2_3, _mm_set1_epi32(0x01000010));
-
-    const __m128i input0 = _mm_or_si128(t1_0, t3_0);
-    const __m128i input1 = _mm_or_si128(t1_1, t3_1);
-    const __m128i input2 = _mm_or_si128(t1_2, t3_2);
-    const __m128i input3 = _mm_or_si128(t1_3, t3_3);
-
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved<isbase64url>(input0));
-    out += 16;
+  for (; i + 100 <= srclen; i += 96) {
+    __m128i in0_lo =
+        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 0);
+    __m128i in0_hi =
+        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 1);
+    __m128i in1_lo =
+        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 2);
+    __m128i in1_hi =
+        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 3);
+    __m128i in2_lo =
+        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 4);
+    __m128i in2_hi =
+        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 5);
+    __m128i in3_lo =
+        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 6);
+    __m128i in3_hi =
+        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 7);
+
+    __m256i in0 = lasx_set_q(in0_hi, in0_lo);
+    __m256i in1 = lasx_set_q(in1_hi, in1_lo);
+    __m256i in2 = lasx_set_q(in2_hi, in2_lo);
+    __m256i in3 = lasx_set_q(in3_hi, in3_lo);
+
+    in0 = __lasx_xvshuf_b(in0, in0, (__m256i)shuf);
+    in1 = __lasx_xvshuf_b(in1, in1, (__m256i)shuf);
+    in2 = __lasx_xvshuf_b(in2, in2, (__m256i)shuf);
+    in3 = __lasx_xvshuf_b(in3, in3, (__m256i)shuf);
+
+    __m256i t0_0 = __lasx_xvand_v(in0, v_fc0fc00);
+    __m256i t0_1 = __lasx_xvand_v(in1, v_fc0fc00);
+    __m256i t0_2 = __lasx_xvand_v(in2, v_fc0fc00);
+    __m256i t0_3 = __lasx_xvand_v(in3, v_fc0fc00);
+
+    __m256i t1_0 = __lasx_xvsrl_h(t0_0, shift_r);
+    __m256i t1_1 = __lasx_xvsrl_h(t0_1, shift_r);
+    __m256i t1_2 = __lasx_xvsrl_h(t0_2, shift_r);
+    __m256i t1_3 = __lasx_xvsrl_h(t0_3, shift_r);
+
+    __m256i t2_0 = __lasx_xvand_v(in0, v_3f03f0);
+    __m256i t2_1 = __lasx_xvand_v(in1, v_3f03f0);
+    __m256i t2_2 = __lasx_xvand_v(in2, v_3f03f0);
+    __m256i t2_3 = __lasx_xvand_v(in3, v_3f03f0);
+
+    __m256i t3_0 = __lasx_xvsll_h(t2_0, shift_l);
+    __m256i t3_1 = __lasx_xvsll_h(t2_1, shift_l);
+    __m256i t3_2 = __lasx_xvsll_h(t2_2, shift_l);
+    __m256i t3_3 = __lasx_xvsll_h(t2_3, shift_l);
+
+    __m256i input0 = __lasx_xvor_v(t1_0, t3_0);
+    __m256i input0_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input0);
+    __m256i input0_shuf1 = __lasx_xvshuf_b(
+        base64_tbl3, base64_tbl2, __lasx_xvsub_b(input0, __lasx_xvldi(32)));
+    __m256i input0_mask = __lasx_xvslei_bu(input0, 31);
+    __m256i input0_result =
+        __lasx_xvbitsel_v(input0_shuf1, input0_shuf0, input0_mask);
+    __lasx_xvst(input0_result, reinterpret_cast<__m256i *>(out), 0);
+    out += 32;
 
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved<isbase64url>(input1));
-    out += 16;
+    __m256i input1 = __lasx_xvor_v(t1_1, t3_1);
+    __m256i input1_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input1);
+    __m256i input1_shuf1 = __lasx_xvshuf_b(
+        base64_tbl3, base64_tbl2, __lasx_xvsub_b(input1, __lasx_xvldi(32)));
+    __m256i input1_mask = __lasx_xvslei_bu(input1, 31);
+    __m256i input1_result =
+        __lasx_xvbitsel_v(input1_shuf1, input1_shuf0, input1_mask);
+    __lasx_xvst(input1_result, reinterpret_cast<__m256i *>(out), 0);
+    out += 32;
 
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved<isbase64url>(input2));
-    out += 16;
+    __m256i input2 = __lasx_xvor_v(t1_2, t3_2);
+    __m256i input2_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input2);
+    __m256i input2_shuf1 = __lasx_xvshuf_b(
+        base64_tbl3, base64_tbl2, __lasx_xvsub_b(input2, __lasx_xvldi(32)));
+    __m256i input2_mask = __lasx_xvslei_bu(input2, 31);
+    __m256i input2_result =
+        __lasx_xvbitsel_v(input2_shuf1, input2_shuf0, input2_mask);
+    __lasx_xvst(input2_result, reinterpret_cast<__m256i *>(out), 0);
+    out += 32;
 
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved<isbase64url>(input3));
-    out += 16;
+    __m256i input3 = __lasx_xvor_v(t1_3, t3_3);
+    __m256i input3_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input3);
+    __m256i input3_shuf1 = __lasx_xvshuf_b(
+        base64_tbl3, base64_tbl2, __lasx_xvsub_b(input3, __lasx_xvldi(32)));
+    __m256i input3_mask = __lasx_xvslei_bu(input3, 31);
+    __m256i input3_result =
+        __lasx_xvbitsel_v(input3_shuf1, input3_shuf0, input3_mask);
+    __lasx_xvst(input3_result, reinterpret_cast<__m256i *>(out), 0);
+    out += 32;
   }
-  for (; i + 16 <= srclen; i += 12) {
+  for (; i + 28 <= srclen; i += 24) {
 
-    __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input + i));
+    __m128i in_lo = __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 0);
+    __m128i in_hi =
+        __lsx_vld(reinterpret_cast<const __m128i *>(input + i), 4 * 3 * 1);
+
+    __m256i in = lasx_set_q(in_hi, in_lo);
 
     // bytes from groups A, B and C are needed in separate 32-bit lanes
     // in = [DDDD|CCCC|BBBB|AAAA]
@@ -37921,40 +52992,43 @@ size_t encode_base64(char *dst, const char *src, size_t srclen,
     //      [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc]
     //           ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^
     //                  processed bits
-    in = _mm_shuffle_epi8(in, shuf);
+    in = __lasx_xvshuf_b(in, in, (__m256i)shuf);
 
     // unpacking
-
     // t0    = [0000cccc|cc000000|aaaaaa00|00000000]
-    const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00));
+    __m256i t0 = __lasx_xvand_v(in, v_fc0fc00);
     // t1    = [00000000|00cccccc|00000000|00aaaaaa]
-    //          (c * (1 << 10), a * (1 << 6)) >> 16 (note: an unsigned
-    //          multiplication)
-    const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
+    //          ((c >> 6),  (a >> 10))
+    __m256i t1 = __lasx_xvsrl_h(t0, shift_r);
 
     // t2    = [00000000|00dddddd|000000bb|bbbb0000]
-    const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0));
-    // t3    = [00dddddd|00000000|00bbbbbb|00000000](
-    //          (d * (1 << 8), b * (1 << 4))
-    const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
+    __m256i t2 = __lasx_xvand_v(in, v_3f03f0);
+    // t3    = [00dddddd|00000000|00bbbbbb|00000000]
+    //          ((d << 8), (b << 4))
+    __m256i t3 = __lasx_xvsll_h(t2, shift_l);
 
     // res   = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3
-    const __m128i indices = _mm_or_si128(t1, t3);
-
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(out),
-                     lookup_pshufb_improved<isbase64url>(indices));
-    out += 16;
+    __m256i indices = __lasx_xvor_v(t1, t3);
+
+    __m256i indices_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, indices);
+    __m256i indices_shuf1 = __lasx_xvshuf_b(
+        base64_tbl3, base64_tbl2, __lasx_xvsub_b(indices, __lasx_xvldi(32)));
+    __m256i indices_mask = __lasx_xvslei_bu(indices, 31);
+    __m256i indices_result =
+        __lasx_xvbitsel_v(indices_shuf1, indices_shuf0, indices_mask);
+    __lasx_xvst(indices_result, reinterpret_cast<__m256i *>(out), 0);
+    out += 32;
   }
 
   return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i,
                                                         srclen - i, options);
 }
+
 static inline void compress(__m128i data, uint16_t mask, char *output) {
   if (mask == 0) {
-    _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data);
+    __lsx_vst(data, reinterpret_cast<__m128i *>(output), 0);
     return;
   }
-
   // this particular implementation was inspired by work done by @animetosho
   // we do it in two steps, first 8 bytes and then second 8 bytes
   uint8_t mask1 = uint8_t(mask);      // least significant 8 bits
@@ -37963,13 +53037,15 @@ static inline void compress(__m128i data, uint16_t mask, char *output) {
   // thintable_epi8[mask2] into a 128-bit register, using only
   // two instructions on most compilers.
 
-  __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2],
-                                    tables::base64::thintable_epi8[mask1]);
+  v2u64 shufmask = {tables::base64::thintable_epi8[mask1],
+                    tables::base64::thintable_epi8[mask2]};
+
   // we increment by 0x08 the second half of the mask
-  shufmask =
-      _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0));
+  const v4u32 hi = {0, 0, 0x08080808, 0x08080808};
+  __m128i shufmask1 = __lsx_vadd_b((__m128i)shufmask, (__m128i)hi);
+
   // this is the version "nearly pruned"
-  __m128i pruned = _mm_shuffle_epi8(data, shufmask);
+  __m128i pruned = __lsx_vshuf_b(data, data, shufmask1);
   // we still need to put the two halves together.
   // we compute the popcount of the first half:
   int pop1 = tables::base64::BitsSetTable256mul2[mask1];
@@ -37977,212 +53053,185 @@ static inline void compress(__m128i data, uint16_t mask, char *output) {
   // only the first pop1 bytes from the first 8 bytes, and then
   // it fills in with the bytes from the second 8 bytes + some filling
   // at the end.
-  __m128i compactmask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(
-      tables::base64::pshufb_combine_table + pop1 * 8));
-  __m128i answer = _mm_shuffle_epi8(pruned, compactmask);
-  _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer);
+  __m128i compactmask =
+      __lsx_vld(reinterpret_cast<const __m128i *>(
+                    tables::base64::pshufb_combine_table + pop1 * 8),
+                0);
+  __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask);
+
+  __lsx_vst(answer, reinterpret_cast<__m128i *>(output), 0);
 }
 
 struct block64 {
-  __m128i chunks[4];
+  __m256i chunks[2];
 };
 
 template <bool base64_url>
-static inline uint16_t to_base64_mask(__m128i *src, uint32_t *error) {
-  const __m128i ascii_space_tbl =
-      _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0,
-                    0xc, 0xd, 0x0, 0x0);
+static inline uint32_t to_base64_mask(__m256i *src, bool *error) {
+  __m256i ascii_space_tbl =
+      ____m256i((__m128i)v16u8{0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+                               0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0});
   // credit: aqrit
-  __m128i delta_asso;
+  __m256i delta_asso =
+      ____m256i((__m128i)v16u8{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0,
+                               0x0, 0x0, 0x0, 0xF, 0x0, 0xF});
+  __m256i delta_values;
   if (base64_url) {
-    delta_asso = _mm_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0,
-                               0x0, 0x0, 0x0, 0xF, 0x0, 0xF);
+    delta_values = ____m256i(
+        (__m128i)v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+                       int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+                       int8_t(0xB9), int8_t(0x00), int8_t(0x11), int8_t(0xC3),
+                       int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0xB9)});
   } else {
-
-    delta_asso = _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                               0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F);
+    delta_values = ____m256i(
+        (__m128i)v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
+                       int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
+                       int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
+                       int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)});
   }
-  __m128i delta_values;
-  if (base64_url) {
-    delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF),
-                                 uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9),
-                                 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF),
-                                 uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9));
-  } else {
 
-    delta_values =
-        _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13),
-                      int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9),
-                      int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3),
-                      int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9));
-  }
-  __m128i check_asso;
+  __m256i check_asso;
   if (base64_url) {
-    check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1,
-                               0x3, 0x7, 0xB, 0xE, 0xB, 0x6);
+    check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                          0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
+                                          0x0B, 0x06, 0x0B, 0x12});
   } else {
-
-    check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
-                               0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F);
+    check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01,
+                                          0x01, 0x01, 0x01, 0x01, 0x03, 0x07,
+                                          0x0B, 0x0B, 0x0B, 0x0F});
   }
-  __m128i check_values;
+
+  __m256i check_values;
   if (base64_url) {
-    check_values = _mm_setr_epi8(uint8_t(0x80), uint8_t(0x80), uint8_t(0x80),
-                                 uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF),
-                                 uint8_t(0xB6), uint8_t(0xA6), uint8_t(0xB5),
-                                 uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0,
-                                 uint8_t(0x80), 0x0, uint8_t(0x80));
+    check_values = ____m256i(
+        (__m128i)v16i8{int8_t(0x0), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+                       int8_t(0xCF), int8_t(0xBF), int8_t(0xD3), int8_t(0xA6),
+                       int8_t(0xB5), int8_t(0x86), int8_t(0xD0), int8_t(0x80),
+                       int8_t(0xB0), int8_t(0x80), int8_t(0x0), int8_t(0x0)});
   } else {
-
-    check_values =
-        _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
-                      int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
-                      int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
-                      int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80));
-  }
-  const __m128i shifted = _mm_srli_epi32(*src, 3);
-
-  const __m128i delta_hash =
-      _mm_avg_epu8(_mm_shuffle_epi8(delta_asso, *src), shifted);
-  const __m128i check_hash =
-      _mm_avg_epu8(_mm_shuffle_epi8(check_asso, *src), shifted);
-
-  const __m128i out =
-      _mm_adds_epi8(_mm_shuffle_epi8(delta_values, delta_hash), *src);
-  const __m128i chk =
-      _mm_adds_epi8(_mm_shuffle_epi8(check_values, check_hash), *src);
-  const int mask = _mm_movemask_epi8(chk);
+    check_values = ____m256i(
+        (__m128i)v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80),
+                       int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6),
+                       int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80),
+                       int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)});
+  }
+
+  __m256i shifted = __lasx_xvsrli_b(*src, 3);
+  __m256i asso_index = __lasx_xvand_v(*src, __lasx_xvldi(0xF));
+  __m256i delta_hash = __lasx_xvavgr_bu(
+      __lasx_xvshuf_b(delta_asso, delta_asso, asso_index), shifted);
+  __m256i check_hash = __lasx_xvavgr_bu(
+      __lasx_xvshuf_b(check_asso, check_asso, asso_index), shifted);
+
+  __m256i out = __lasx_xvsadd_b(
+      __lasx_xvshuf_b(delta_values, delta_values, delta_hash), *src);
+  __m256i chk = __lasx_xvsadd_b(
+      __lasx_xvshuf_b(check_values, check_values, check_hash), *src);
+  __m256i chk_ltz = __lasx_xvmskltz_b(chk);
+  unsigned int mask = __lasx_xvpickve2gr_wu(chk_ltz, 0);
+  mask = mask | (__lsx_vpickve2gr_hu(lasx_extracti128_hi(chk_ltz), 0) << 16);
   if (mask) {
-    __m128i ascii_space =
-        _mm_cmpeq_epi8(_mm_shuffle_epi8(ascii_space_tbl, *src), *src);
-    *error = (mask ^ _mm_movemask_epi8(ascii_space));
+    __m256i ascii_space = __lasx_xvseq_b(
+        __lasx_xvshuf_b(ascii_space_tbl, ascii_space_tbl, asso_index), *src);
+    __m256i ascii_space_ltz = __lasx_xvmskltz_b(ascii_space);
+    unsigned int ascii_space_mask = __lasx_xvpickve2gr_wu(ascii_space_ltz, 0);
+    ascii_space_mask =
+        ascii_space_mask |
+        (__lsx_vpickve2gr_hu(lasx_extracti128_hi(ascii_space_ltz), 0) << 16);
+    *error |= (mask != ascii_space_mask);
   }
+
   *src = out;
-  return (uint16_t)mask;
+  return (uint32_t)mask;
 }
 
 template <bool base64_url>
-static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) {
-  uint32_t err0 = 0;
-  uint32_t err1 = 0;
-  uint32_t err2 = 0;
-  uint32_t err3 = 0;
-  uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], &err0);
-  uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], &err1);
-  uint64_t m2 = to_base64_mask<base64_url>(&b->chunks[2], &err2);
-  uint64_t m3 = to_base64_mask<base64_url>(&b->chunks[3], &err3);
-  *error = (err0) | ((uint64_t)err1 << 16) | ((uint64_t)err2 << 32) |
-           ((uint64_t)err3 << 48);
-  return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
-}
-
-#if defined(_MSC_VER) && !defined(__clang__)
-static inline size_t simdutf_tzcnt_u64(uint64_t num) {
-  unsigned long ret;
-  if (num == 0) {
-    return 64;
-  }
-  _BitScanForward64(&ret, num);
-  return ret;
-}
-#else // GCC or Clang
-static inline size_t simdutf_tzcnt_u64(uint64_t num) {
-  return num ? __builtin_ctzll(num) : 64;
+static inline uint64_t to_base64_mask(block64 *b, bool *error) {
+  *error = 0;
+  uint64_t m0 = to_base64_mask<base64_url>(&b->chunks[0], error);
+  uint64_t m1 = to_base64_mask<base64_url>(&b->chunks[1], error);
+  return m0 | (m1 << 32);
 }
-#endif
 
 static inline void copy_block(block64 *b, char *output) {
-  _mm_storeu_si128(reinterpret_cast<__m128i *>(output), b->chunks[0]);
-  _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 16), b->chunks[1]);
-  _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 32), b->chunks[2]);
-  _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 48), b->chunks[3]);
+  __lasx_xvst(b->chunks[0], reinterpret_cast<__m256i *>(output), 0);
+  __lasx_xvst(b->chunks[1], reinterpret_cast<__m256i *>(output), 32);
 }
 
 static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) {
   uint64_t nmask = ~mask;
-  compress(b->chunks[0], uint16_t(mask), output);
-  compress(b->chunks[1], uint16_t(mask >> 16),
-           output + _mm_popcnt_u64(nmask & 0xFFFF));
-  compress(b->chunks[2], uint16_t(mask >> 32),
-           output + _mm_popcnt_u64(nmask & 0xFFFFFFFF));
-  compress(b->chunks[3], uint16_t(mask >> 48),
-           output + _mm_popcnt_u64(nmask & 0xFFFFFFFFFFFFULL));
-  return _mm_popcnt_u64(nmask);
+  uint64_t count =
+      __lsx_vpickve2gr_d(__lsx_vpcnt_h(__lsx_vreplgr2vr_d(nmask)), 0);
+  uint16_t *count_ptr = (uint16_t *)&count;
+  compress(lasx_extracti128_lo(b->chunks[0]), uint16_t(mask), output);
+  compress(lasx_extracti128_hi(b->chunks[0]), uint16_t(mask >> 16),
+           output + count_ptr[0]);
+  compress(lasx_extracti128_lo(b->chunks[1]), uint16_t(mask >> 32),
+           output + count_ptr[0] + count_ptr[1]);
+  compress(lasx_extracti128_hi(b->chunks[1]), uint16_t(mask >> 48),
+           output + count_ptr[0] + count_ptr[1] + count_ptr[2]);
+  return count_ones(nmask);
 }
 
 // The caller of this function is responsible to ensure that there are 64 bytes
 // available from reading at src. The data is read into a block64 structure.
 static inline void load_block(block64 *b, const char *src) {
-  b->chunks[0] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
-  b->chunks[1] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
-  b->chunks[2] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32));
-  b->chunks[3] = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
+  b->chunks[0] = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 0);
+  b->chunks[1] = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 32);
 }
 
 // The caller of this function is responsible to ensure that there are 128 bytes
 // available from reading at src. The data is read into a block64 structure.
 static inline void load_block(block64 *b, const char16_t *src) {
-  __m128i m1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
-  __m128i m2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 8));
-  __m128i m3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16));
-  __m128i m4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 24));
-  __m128i m5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32));
-  __m128i m6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 40));
-  __m128i m7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48));
-  __m128i m8 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 56));
-  b->chunks[0] = _mm_packus_epi16(m1, m2);
-  b->chunks[1] = _mm_packus_epi16(m3, m4);
-  b->chunks[2] = _mm_packus_epi16(m5, m6);
-  b->chunks[3] = _mm_packus_epi16(m7, m8);
+  __m256i m1 = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 0);
+  __m256i m2 = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 32);
+  __m256i m3 = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 64);
+  __m256i m4 = __lasx_xvld(reinterpret_cast<const __m256i *>(src), 96);
+  b->chunks[0] = __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(m2, m1, 0), 0b11011000);
+  b->chunks[1] = __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(m4, m3, 0), 0b11011000);
 }
 
-static inline void base64_decode(char *out, __m128i str) {
-  // credit: aqrit
-
-  const __m128i pack_shuffle =
-      _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
+static inline void base64_decode(char *out, __m256i str) {
+  __m256i t0 = __lasx_xvor_v(
+      __lasx_xvslli_w(str, 26),
+      __lasx_xvslli_w(__lasx_xvand_v(str, __lasx_xvldi(-1758 /*0x0000FF00*/)),
+                      12));
+  __m256i t1 = __lasx_xvsrli_w(
+      __lasx_xvand_v(str, __lasx_xvldi(-3521 /*0x003F0000*/)), 2);
+  __m256i t2 = __lasx_xvor_v(t0, t1);
+  __m256i t3 = __lasx_xvor_v(t2, __lasx_xvsrli_w(str, 16));
+  __m256i pack_shuffle = ____m256i(
+      (__m128i)v16u8{3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, 0, 0, 0, 0});
+  t3 = __lasx_xvshuf_b(t3, t3, (__m256i)pack_shuffle);
 
-  const __m128i t0 = _mm_maddubs_epi16(str, _mm_set1_epi32(0x01400140));
-  const __m128i t1 = _mm_madd_epi16(t0, _mm_set1_epi32(0x00011000));
-  const __m128i t2 = _mm_shuffle_epi8(t1, pack_shuffle);
   // Store the output:
-  // this writes 16 bytes, but we only need 12.
-  _mm_storeu_si128((__m128i *)out, t2);
+  __lsx_vst(lasx_extracti128_lo(t3), out, 0);  // 16-byte store, 12 are payload
+  __lsx_vst(lasx_extracti128_hi(t3), out, 12); // spills 4 bytes past out+24
 }
 // decode 64 bytes and output 48 bytes
 static inline void base64_decode_block(char *out, const char *src) {
-  base64_decode(out, _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
-  base64_decode(out + 12,
-                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16)));
+  base64_decode(out, __lasx_xvld(reinterpret_cast<const __m256i *>(src), 0));
   base64_decode(out + 24,
-                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32)));
-  base64_decode(out + 36,
-                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48)));
+                __lasx_xvld(reinterpret_cast<const __m256i *>(src), 32));
 }
+
 static inline void base64_decode_block_safe(char *out, const char *src) {
-  base64_decode(out, _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
-  base64_decode(out + 12,
-                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16)));
-  base64_decode(out + 24,
-                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 32)));
-  char buffer[16];
+  base64_decode(out, __lasx_xvld(reinterpret_cast<const __m256i *>(src), 0));
+  char buffer[32]; // base64_decode stores 28 bytes; only 24 are payload
   base64_decode(buffer,
-                _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 48)));
-  std::memcpy(out + 36, buffer, 12);
+                __lasx_xvld(reinterpret_cast<const __m256i *>(src), 32));
+  std::memcpy(out + 24, buffer, 24); // so nothing lands past out + 48
 }
+
 static inline void base64_decode_block(char *out, block64 *b) {
   base64_decode(out, b->chunks[0]);
-  base64_decode(out + 12, b->chunks[1]);
-  base64_decode(out + 24, b->chunks[2]);
-  base64_decode(out + 36, b->chunks[3]);
+  base64_decode(out + 24, b->chunks[1]);
 }
 static inline void base64_decode_block_safe(char *out, block64 *b) {
   base64_decode(out, b->chunks[0]);
-  base64_decode(out + 12, b->chunks[1]);
-  base64_decode(out + 24, b->chunks[2]);
-  char buffer[16];
-  base64_decode(buffer, b->chunks[3]);
-  std::memcpy(out + 36, buffer, 12);
+  char buffer[32]; // base64_decode stores 28 bytes; only 24 are payload
+  base64_decode(buffer, b->chunks[1]);
+  std::memcpy(out + 24, buffer, 24); // so nothing lands past out + 48
 }
 
 template <bool base64_url, typename chartype>
@@ -38229,7 +53278,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen,
   const chartype *const srcend = src + srclen;
 
   constexpr size_t block_size = 6;
-  static_assert(block_size >= 2, "block should of size 2 or more");
+  static_assert(block_size >= 2, "block_size must be at least two");
   char buffer[block_size * 64];
   char *bufferptr = buffer;
   if (srclen >= 64) {
@@ -38238,13 +53287,16 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen,
       block64 b;
       load_block(&b, src);
       src += 64;
-      uint64_t error = 0;
+      bool error = false;
       uint64_t badcharmask = to_base64_mask<base64_url>(&b, &error);
       if (error) {
         src -= 64;
-        size_t error_offset = simdutf_tzcnt_u64(error);
-        return {error_code::INVALID_BASE64_CHARACTER,
-                size_t(src - srcinit + error_offset), size_t(dst - dstinit)};
+        while (src < srcend && scalar::base64::is_eight_byte(*src) &&
+               to_base64[uint8_t(*src)] <= 64) {
+          src++;
+        }
+        return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit),
+                size_t(dst - dstinit)};
       }
       if (badcharmask != 0) {
         // optimization opportunity: check for simple masks like those made of
@@ -38285,6 +53337,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen,
   // time, otherwise, we should just decode directly.
   int last_block = (int)((bufferptr - buffer_start) % 64);
   if (last_block != 0 && srcend - src + last_block >= 64) {
+
     while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) {
       uint8_t val = to_base64[uint8_t(*src)];
       *bufferptr = char(val);
@@ -38370,15 +53423,15 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen,
   }
   return {SUCCESS, srclen, size_t(dst - dstinit)};
 }
-/* end file src/westmere/sse_base64.cpp */
+/* end file src/lasx/lasx_base64.cpp */
 
-} // unnamed namespace
-} // namespace westmere
+} // namespace
+} // namespace lasx
 } // namespace simdutf
 
 /* begin file src/generic/buf_block_reader.h */
 namespace simdutf {
-namespace westmere {
+namespace lasx {
 namespace {
 
 // Walks through a buffer in block-sized increments, loading the last part with
@@ -38484,12 +53537,12 @@ simdutf_really_inline void buf_block_reader<STEP_SIZE>::advance() {
 }
 
 } // unnamed namespace
-} // namespace westmere
+} // namespace lasx
 } // namespace simdutf
 /* end file src/generic/buf_block_reader.h */
 /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
 namespace simdutf {
-namespace westmere {
+namespace lasx {
 namespace {
 namespace utf8_validation {
 
@@ -38709,12 +53762,12 @@ struct utf8_checker {
 using utf8_validation::utf8_checker;
 
 } // unnamed namespace
-} // namespace westmere
+} // namespace lasx
 } // namespace simdutf
 /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */
 /* begin file src/generic/utf8_validation/utf8_validator.h */
 namespace simdutf {
-namespace westmere {
+namespace lasx {
 namespace {
 namespace utf8_validation {
 
@@ -38843,103 +53896,31 @@ result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) {
 }
 
 result generic_validate_ascii_with_errors(const char *input, size_t length) {
-  return generic_validate_ascii_with_errors<utf8_checker>(
-      reinterpret_cast<const uint8_t *>(input), length);
-}
-
-} // namespace utf8_validation
-} // unnamed namespace
-} // namespace westmere
-} // namespace simdutf
-/* end file src/generic/utf8_validation/utf8_validator.h */
-// transcoding from UTF-8 to UTF-16
-/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
-
-namespace simdutf {
-namespace westmere {
-namespace {
-namespace utf8_to_utf16 {
-
-using namespace simd;
-
-template <endianness endian>
-simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
-                                         char16_t *utf16_output) noexcept {
-  // The implementation is not specific to haswell and should be moved to the
-  // generic directory.
-  size_t pos = 0;
-  char16_t *start{utf16_output};
-  const size_t safety_margin = 16; // to avoid overruns!
-  while (pos + 64 + safety_margin <= size) {
-    // this loop could be unrolled further. For example, we could process the
-    // mask far more than 64 bytes.
-    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
-    if (in.is_ascii()) {
-      in.store_ascii_as_utf16<endian>(utf16_output);
-      utf16_output += 64;
-      pos += 64;
-    } else {
-      // Slow path. We hope that the compiler will recognize that this is a slow
-      // path. Anything that is not a continuation mask is a 'leading byte',
-      // that is, the start of a new code point.
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
-      // -65 is 0b10111111 in two-complement's, so largest possible continuation
-      // byte
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      // The *start* of code points is not so useful, rather, we want the *end*
-      // of code points.
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-      // We process in blocks of up to 12 bytes except possibly
-      // for fast paths which may process up to 16 bytes. For the
-      // slow path to work, we should have at least 12 input bytes left.
-      size_t max_starting_point = (pos + 64) - 12;
-      // Next loop is going to run at least five times when using solely
-      // the slow/regular path, and at least four times if there are fast paths.
-      while (pos < max_starting_point) {
-        // Performance note: our ability to compute 'consumed' and
-        // then shift and recompute is critical. If there is a
-        // latency of, say, 4 cycles on getting 'consumed', then
-        // the inner loop might have a total latency of about 6 cycles.
-        // Yet we process between 6 to 12 inputs bytes, thus we get
-        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-        // for this section of the code. Hence, there is a limit
-        // to how much we can further increase this latency before
-        // it seriously harms performance.
-        //
-        // Thus we may allow convert_masked_utf8_to_utf16 to process
-        // more bytes at a time under a fast-path mode where 16 bytes
-        // are consumed at once (e.g., when encountering ASCII).
-        size_t consumed = convert_masked_utf8_to_utf16<endian>(
-            input + pos, utf8_end_of_code_point_mask, utf16_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
-      }
-      // At this point there may remain between 0 and 12 bytes in the
-      // 64-byte block. These bytes will be processed again. So we have an
-      // 80% efficiency (in the worst case). In practice we expect an
-      // 85% to 90% efficiency.
-    }
-  }
-  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
-      input + pos, size - pos, utf16_output);
-  return utf16_output - start;
+  return generic_validate_ascii_with_errors<utf8_checker>(
+      reinterpret_cast<const uint8_t *>(input), length);
 }
 
-} // namespace utf8_to_utf16
+} // namespace utf8_validation
 } // unnamed namespace
-} // namespace westmere
+} // namespace lasx
 } // namespace simdutf
-/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
-/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
+/* end file src/generic/utf8_validation/utf8_validator.h */
+
+// transcoding from UTF-8 to Latin 1
+/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
 
 namespace simdutf {
-namespace westmere {
+namespace lasx {
 namespace {
-namespace utf8_to_utf16 {
+namespace utf8_to_latin1 {
 using namespace simd;
 
 simdutf_really_inline simd8<uint8_t>
 check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
+  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
+  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
+  // 0b11000010 and nothing else.
+  //
   // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
   // Bit 1 = Too Long (ASCII followed by continuation)
   // Bit 2 = Overlong 3-byte
@@ -38966,6 +53947,7 @@ check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
   // 1111011_ 1000____
   // 11111___ 1000____
   constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
+  constexpr const uint8_t FORBIDDEN = 0xff;
 
   const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
       // 0_______ ________ <ASCII in byte 1>
@@ -38976,11 +53958,11 @@ check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
       // 1100____ ________ <two byte lead in byte 1>
       TOO_SHORT | OVERLONG_2,
       // 1101____ ________ <two byte lead in byte 1>
-      TOO_SHORT,
+      FORBIDDEN,
       // 1110____ ________ <three byte lead in byte 1>
-      TOO_SHORT | OVERLONG_3 | SURROGATE,
+      FORBIDDEN,
       // 1111____ ________ <four+ byte lead in byte 1>
-      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
+      FORBIDDEN);
   constexpr const uint8_t CARRY =
       TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
   const simd8<uint8_t> byte_1_low =
@@ -38994,23 +53976,16 @@ check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
               CARRY, CARRY,
 
               // ____0100 ________
-              CARRY | TOO_LARGE,
+              FORBIDDEN,
               // ____0101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              FORBIDDEN,
               // ____011_ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              FORBIDDEN, FORBIDDEN,
 
               // ____1___ ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
               // ____1101 ________
-              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
-              CARRY | TOO_LARGE | TOO_LARGE_1000,
-              CARRY | TOO_LARGE | TOO_LARGE_1000);
+              FORBIDDEN, FORBIDDEN, FORBIDDEN);
   const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
       // ________ 0_______ <ASCII in byte 2>
       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
@@ -39029,17 +54004,6 @@ check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
   return (byte_1_high & byte_1_low & byte_2_high);
 }
-simdutf_really_inline simd8<uint8_t>
-check_multibyte_lengths(const simd8<uint8_t> input,
-                        const simd8<uint8_t> prev_input,
-                        const simd8<uint8_t> sc) {
-  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
-  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
-  simd8<uint8_t> must23 =
-      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
-  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
-  return must23_80 ^ sc;
-}
 
 struct validating_transcoder {
   // If this is nonzero, there has been a UTF-8 error.
@@ -39055,25 +54019,24 @@ struct validating_transcoder {
     // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
     // small negative numbers)
     simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-    simd8<uint8_t> sc = check_special_cases(input, prev1);
-    this->error |= check_multibyte_lengths(input, prev_input, sc);
+    this->error |= check_special_cases(input, prev1);
   }
 
-  template <endianness endian>
   simdutf_really_inline size_t convert(const char *in, size_t size,
-                                       char16_t *utf16_output) {
+                                       char *latin1_output) {
     size_t pos = 0;
-    char16_t *start{utf16_output};
+    char *start{latin1_output};
     // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
+    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
     // last 16 bytes, and if the data is valid, then it is entirely safe because
     // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
     // generally assume that you have valid UTF-8 input, so we are going to go
-    // back from the end counting 8 leading bytes, to give us a good margin.
+    // back from the end counting 16 leading bytes, to give us a good margin.
     size_t leading_byte = 0;
     size_t margin = size;
-    for (; margin > 0 && leading_byte < 8; margin--) {
-      leading_byte += (int8_t(in[margin - 1]) > -65);
+    for (; margin > 0 && leading_byte < 16; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) >
+                       -65); // twos complement of -65 is 1011 1111 ...
     }
     // If the input is long enough, then we have that margin-1 is the eight last
     // leading byte.
@@ -39081,8 +54044,8 @@ struct validating_transcoder {
     while (pos + 64 + safety_margin <= size) {
       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
       if (input.is_ascii()) {
-        input.store_ascii_as_utf16<endian>(utf16_output);
-        utf16_output += 64;
+        input.store((int8_t *)latin1_output);
+        latin1_output += 64;
         pos += 64;
       } else {
         // you might think that a for-loop would work, but under Visual Studio,
@@ -39101,10 +54064,9 @@ struct validating_transcoder {
           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
         }
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        if (utf8_continuation_mask & 1) {
-          return 0; // error
-        }
+        uint64_t utf8_continuation_mask =
+            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
+                               // this case, we also have ASCII to account for.
         uint64_t utf8_leading_mask = ~utf8_continuation_mask;
         uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
         // We process in blocks of up to 12 bytes except possibly
@@ -39122,8 +54084,8 @@ struct validating_transcoder {
           // for this section of the code. Hence, there is a limit
           // to how much we can further increase this latency before
           // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_utf16<endian>(
-              in + pos, utf8_end_of_code_point_mask, utf16_output);
+          size_t consumed = convert_masked_utf8_to_latin1(
+              in + pos, utf8_end_of_code_point_mask, latin1_output);
           pos += consumed;
           utf8_end_of_code_point_mask >>= consumed;
         }
@@ -39137,23 +54099,22 @@ struct validating_transcoder {
       return 0;
     }
     if (pos < size) {
-      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
-          in + pos, size - pos, utf16_output);
+      size_t howmany =
+          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
       if (howmany == 0) {
         return 0;
       }
-      utf16_output += howmany;
+      latin1_output += howmany;
     }
-    return utf16_output - start;
+    return latin1_output - start;
   }
 
-  template <endianness endian>
   simdutf_really_inline result convert_with_errors(const char *in, size_t size,
-                                                   char16_t *utf16_output) {
+                                                   char *latin1_output) {
     size_t pos = 0;
-    char16_t *start{utf16_output};
+    char *start{latin1_output};
     // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
+    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
     // last 16 bytes, and if the data is valid, then it is entirely safe because
     // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
     // generally assume that you have valid UTF-8 input, so we are going to go
@@ -39169,8 +54130,8 @@ struct validating_transcoder {
     while (pos + 64 + safety_margin <= size) {
       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
       if (input.is_ascii()) {
-        input.store_ascii_as_utf16<endian>(utf16_output);
-        utf16_output += 64;
+        input.store((int8_t *)latin1_output);
+        latin1_output += 64;
         pos += 64;
       } else {
         // you might think that a for-loop would work, but under Visual Studio,
@@ -39189,17 +54150,16 @@ struct validating_transcoder {
           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
         }
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-        if (errors() || (utf8_continuation_mask & 1)) {
+        if (errors()) {
           // rewind_and_convert_with_errors will seek a potential error from
           // in+pos onward, with the ability to go back up to pos bytes, and
           // read size-pos bytes forward.
-          result res =
-              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
-                  pos, in + pos, size - pos, utf16_output);
+          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+              pos, in + pos, size - pos, latin1_output);
           res.count += pos;
           return res;
         }
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
         uint64_t utf8_leading_mask = ~utf8_continuation_mask;
         uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
         // We process in blocks of up to 12 bytes except possibly
@@ -39217,8 +54177,8 @@ struct validating_transcoder {
           // for this section of the code. Hence, there is a limit
           // to how much we can further increase this latency before
           // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_utf16<endian>(
-              in + pos, utf8_end_of_code_point_mask, utf16_output);
+          size_t consumed = convert_masked_utf8_to_latin1(
+              in + pos, utf8_end_of_code_point_mask, latin1_output);
           pos += consumed;
           utf8_end_of_code_point_mask >>= consumed;
         }
@@ -39232,9 +54192,8 @@ struct validating_transcoder {
       // rewind_and_convert_with_errors will seek a potential error from in+pos
       // onward, with the ability to go back up to pos bytes, and read size-pos
       // bytes forward.
-      result res =
-          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
-              pos, in + pos, size - pos, utf16_output);
+      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, latin1_output);
       res.count += pos;
       return res;
     }
@@ -39242,17 +54201,16 @@ struct validating_transcoder {
       // rewind_and_convert_with_errors will seek a potential error from in+pos
       // onward, with the ability to go back up to pos bytes, and read size-pos
       // bytes forward.
-      result res =
-          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
-              pos, in + pos, size - pos, utf16_output);
+      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, latin1_output);
       if (res.error) { // In case of error, we want the error position
         res.count += pos;
         return res;
       } else { // In case of success, we want the number of word written
-        utf16_output += res.count;
+        latin1_output += res.count;
       }
     }
-    return result(error_code::SUCCESS, utf16_output - start);
+    return result(error_code::SUCCESS, latin1_output - start);
   }
 
   simdutf_really_inline bool errors() const {
@@ -39260,63 +54218,176 @@ struct validating_transcoder {
   }
 
 }; // struct utf8_checker
-} // namespace utf8_to_utf16
+} // namespace utf8_to_latin1
 } // unnamed namespace
-} // namespace westmere
+} // namespace lasx
 } // namespace simdutf
-/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
-// transcoding from UTF-8 to UTF-32
-/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
+/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
+/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
 
 namespace simdutf {
-namespace westmere {
+namespace lasx {
 namespace {
-namespace utf8_to_utf32 {
+namespace utf8_to_latin1 {
+using namespace simd;
+
+// Transcodes UTF-8 that is expected to be already valid into Latin-1; nothing
+// is validated here. Returns the number of bytes written to latin1_output.
+simdutf_really_inline size_t convert_valid(const char *in, size_t size,
+                                           char *latin1_output) {
+  size_t pos = 0;
+  char *start{latin1_output};
+  // In the worst case, we have the haswell kernel which can cause an overflow
+  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
+  // 16 bytes, and if the data is valid, then it is entirely safe because 16
+  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
+  // assume that you have valid UTF-8 input, so we are going to go back from the
+  // end counting 8 leading bytes, to give us a good margin.
+  size_t leading_byte = 0;
+  size_t margin = size;
+  for (; margin > 0 && leading_byte < 8; margin--) {
+    leading_byte += (int8_t(in[margin - 1]) >
+                     -65); // twos complement of -65 is 1011 1111 ...
+  }
+  // If the input is long enough, then in[margin] is the eighth-from-last
+  // leading byte (margin is decremented once more after it is counted).
+  const size_t safety_margin = size - margin + 1; // to avoid overruns!
+  while (pos + 64 + safety_margin <= size) {
+    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+    if (input.is_ascii()) {
+      input.store((int8_t *)latin1_output);
+      latin1_output += 64;
+      pos += 64;
+    } else {
+      uint64_t utf8_continuation_mask =
+          input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
+                             // this case, we also have ASCII to account for.
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+      // We process in blocks of up to 12 bytes except possibly
+      // for fast paths which may process up to 16 bytes. For the
+      // slow path to work, we should have at least 12 input bytes left.
+      size_t max_starting_point = (pos + 64) - 12;
+      // Next loop is going to run at least five times.
+      while (pos < max_starting_point) {
+        // Performance note: our ability to compute 'consumed' and
+        // then shift and recompute is critical. If there is a
+        // latency of, say, 4 cycles on getting 'consumed', then
+        // the inner loop might have a total latency of about 6 cycles.
+        // Yet we process between 6 to 12 input bytes, thus we get
+        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+        // for this section of the code. Hence, there is a limit
+        // to how much we can further increase this latency before
+        // it seriously harms performance.
+        size_t consumed = convert_masked_utf8_to_latin1(
+            in + pos, utf8_end_of_code_point_mask, latin1_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
+      }
+      // At this point there may remain between 0 and 12 bytes in the
+      // 64-byte block. These bytes will be processed again. So we have an
+      // 80% efficiency (in the worst case). In practice we expect an
+      // 85% to 90% efficiency.
+    }
+  }
+  if (pos < size) {
+    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
+                                                           latin1_output);
+    latin1_output += howmany;
+  }
+  return latin1_output - start;
+}
+
+} // namespace utf8_to_latin1
+} // namespace
+} // namespace lasx
+} // namespace simdutf
+  // namespace simdutf
+/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
+// transcoding from UTF-8 to UTF-16
+/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
+
+namespace simdutf {
+namespace lasx {
+namespace {
+namespace utf8_to_utf16 {
 
 using namespace simd;
 
+template <endianness endian>
 simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
-                                         char32_t *utf32_output) noexcept {
+                                         char16_t *utf16_output) noexcept {
+  // Transcodes UTF-8 expected to be valid into UTF-16; returns the number of
+  // char16_t units written. Generic code, not specific to any one kernel.
   size_t pos = 0;
-  char32_t *start{utf32_output};
+  char16_t *start{utf16_output};
   const size_t safety_margin = 16; // to avoid overruns!
   while (pos + 64 + safety_margin <= size) {
+    // this loop could be unrolled further. For example, we could process the
+    // mask far more than 64 bytes.
     simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
     if (in.is_ascii()) {
-      in.store_ascii_as_utf32(utf32_output);
-      utf32_output += 64;
+      in.store_ascii_as_utf16<endian>(utf16_output);
+      utf16_output += 64;
       pos += 64;
     } else {
+      // Slow path. We hope that the compiler will recognize that this is a slow
+      // path. Anything that is not a continuation byte is a 'leading byte',
+      // that is, the start of a new code point.
+      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
       // -65 is 0b10111111 in two-complement's, so largest possible continuation
       // byte
-      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
       uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      // The *start* of code points is not so useful, rather, we want the *end*
+      // of code points.
       uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+      // We process in blocks of up to 12 bytes except possibly
+      // for fast paths which may process up to 16 bytes. For the
+      // slow path to work, we should have at least 12 input bytes left.
       size_t max_starting_point = (pos + 64) - 12;
+      // Next loop is going to run at least five times when using solely
+      // the slow/regular path, and at least four times if there are fast paths.
       while (pos < max_starting_point) {
-        size_t consumed = convert_masked_utf8_to_utf32(
-            input + pos, utf8_end_of_code_point_mask, utf32_output);
+        // Performance note: our ability to compute 'consumed' and
+        // then shift and recompute is critical. If there is a
+        // latency of, say, 4 cycles on getting 'consumed', then
+        // the inner loop might have a total latency of about 6 cycles.
+        // Yet we process between 6 to 12 input bytes, thus we get
+        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
+        // for this section of the code. Hence, there is a limit
+        // to how much we can further increase this latency before
+        // it seriously harms performance.
+        //
+        // Thus we may allow convert_masked_utf8_to_utf16 to process
+        // more bytes at a time under a fast-path mode where 16 bytes
+        // are consumed at once (e.g., when encountering ASCII).
+        size_t consumed = convert_masked_utf8_to_utf16<endian>(
+            input + pos, utf8_end_of_code_point_mask, utf16_output);
         pos += consumed;
         utf8_end_of_code_point_mask >>= consumed;
       }
+      // At this point there may remain between 0 and 12 bytes in the
+      // 64-byte block. These bytes will be processed again. So we have an
+      // 80% efficiency (in the worst case). In practice we expect an
+      // 85% to 90% efficiency.
     }
   }
-  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
-                                                       utf32_output);
-  return utf32_output - start;
+  utf16_output += scalar::utf8_to_utf16::convert_valid<endian>(
+      input + pos, size - pos, utf16_output);
+  return utf16_output - start;
 }
 
-} // namespace utf8_to_utf32
+} // namespace utf8_to_utf16
 } // unnamed namespace
-} // namespace westmere
+} // namespace lasx
 } // namespace simdutf
-/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
-/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
+/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */
+/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */
 
 namespace simdutf {
-namespace westmere {
+namespace lasx {
 namespace {
-namespace utf8_to_utf32 {
+namespace utf8_to_utf16 {
 using namespace simd;
 
 simdutf_really_inline simd8<uint8_t>
@@ -39440,29 +54511,30 @@ struct validating_transcoder {
     this->error |= check_multibyte_lengths(input, prev_input, sc);
   }
 
+  template <endianness endian>
   simdutf_really_inline size_t convert(const char *in, size_t size,
-                                       char32_t *utf32_output) {
+                                       char16_t *utf16_output) {
     size_t pos = 0;
-    char32_t *start{utf32_output};
+    char16_t *start{utf16_output};
     // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
+    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
     // last 16 bytes, and if the data is valid, then it is entirely safe because
     // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
     // generally assume that you have valid UTF-8 input, so we are going to go
-    // back from the end counting 16 leading bytes, to give us a good margin.
+    // back from the end counting 8 leading bytes, to give us a good margin.
     size_t leading_byte = 0;
     size_t margin = size;
     for (; margin > 0 && leading_byte < 8; margin--) {
       leading_byte += (int8_t(in[margin - 1]) > -65);
     }
-    // If the input is long enough, then we have that margin-1 is the fourth
-    // last leading byte.
+    // If the input is long enough, then after the loop in[margin] is the
+    // eighth-last leading byte.
     const size_t safety_margin = size - margin + 1; // to avoid overruns!
     while (pos + 64 + safety_margin <= size) {
       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
       if (input.is_ascii()) {
-        input.store_ascii_as_utf32(utf32_output);
-        utf32_output += 64;
+        input.store_ascii_as_utf16<endian>(utf16_output);
+        utf16_output += 64;
         pos += 64;
       } else {
         // you might think that a for-loop would work, but under Visual Studio,
@@ -39483,7 +54555,7 @@ struct validating_transcoder {
         }
         uint64_t utf8_continuation_mask = input.lt(-65 + 1);
         if (utf8_continuation_mask & 1) {
-          return 0; // we have an error
+          return 0; // error
         }
         uint64_t utf8_leading_mask = ~utf8_continuation_mask;
         uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
@@ -39502,8 +54574,8 @@ struct validating_transcoder {
           // for this section of the code. Hence, there is a limit
           // to how much we can further increase this latency before
           // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_utf32(
-              in + pos, utf8_end_of_code_point_mask, utf32_output);
+          size_t consumed = convert_masked_utf8_to_utf16<endian>(
+              in + pos, utf8_end_of_code_point_mask, utf16_output);
           pos += consumed;
           utf8_end_of_code_point_mask >>= consumed;
         }
@@ -39517,22 +54589,23 @@ struct validating_transcoder {
       return 0;
     }
     if (pos < size) {
-      size_t howmany =
-          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
+      size_t howmany = scalar::utf8_to_utf16::convert<endian>(
+          in + pos, size - pos, utf16_output);
       if (howmany == 0) {
         return 0;
       }
-      utf32_output += howmany;
+      utf16_output += howmany;
     }
-    return utf32_output - start;
+    return utf16_output - start;
   }
 
+  template <endianness endian>
   simdutf_really_inline result convert_with_errors(const char *in, size_t size,
-                                                   char32_t *utf32_output) {
+                                                   char16_t *utf16_output) {
     size_t pos = 0;
-    char32_t *start{utf32_output};
+    char16_t *start{utf16_output};
     // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
+    // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the
     // last 16 bytes, and if the data is valid, then it is entirely safe because
     // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
     // generally assume that you have valid UTF-8 input, so we are going to go
@@ -39542,14 +54615,14 @@ struct validating_transcoder {
     for (; margin > 0 && leading_byte < 8; margin--) {
       leading_byte += (int8_t(in[margin - 1]) > -65);
     }
-    // If the input is long enough, then we have that margin-1 is the fourth
-    // last leading byte.
+    // If the input is long enough, then after the loop in[margin] is the
+    // eighth-last leading byte.
     const size_t safety_margin = size - margin + 1; // to avoid overruns!
     while (pos + 64 + safety_margin <= size) {
       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
       if (input.is_ascii()) {
-        input.store_ascii_as_utf32(utf32_output);
-        utf32_output += 64;
+        input.store_ascii_as_utf16<endian>(utf16_output);
+        utf16_output += 64;
         pos += 64;
       } else {
         // you might think that a for-loop would work, but under Visual Studio,
@@ -39570,8 +54643,12 @@ struct validating_transcoder {
         }
         uint64_t utf8_continuation_mask = input.lt(-65 + 1);
         if (errors() || (utf8_continuation_mask & 1)) {
-          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
-              pos, in + pos, size - pos, utf32_output);
+          // rewind_and_convert_with_errors will seek a potential error from
+          // in+pos onward, with the ability to go back up to pos bytes, and
+          // read size-pos bytes forward.
+          result res =
+              scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+                  pos, in + pos, size - pos, utf16_output);
           res.count += pos;
           return res;
         }
@@ -39592,8 +54669,8 @@ struct validating_transcoder {
           // for this section of the code. Hence, there is a limit
           // to how much we can further increase this latency before
           // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_utf32(
-              in + pos, utf8_end_of_code_point_mask, utf32_output);
+          size_t consumed = convert_masked_utf8_to_utf16<endian>(
+              in + pos, utf8_end_of_code_point_mask, utf16_output);
           pos += consumed;
           utf8_end_of_code_point_mask >>= consumed;
         }
@@ -39604,22 +54681,30 @@ struct validating_transcoder {
       }
     }
     if (errors()) {
-      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
-          pos, in + pos, size - pos, utf32_output);
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res =
+          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+              pos, in + pos, size - pos, utf16_output);
       res.count += pos;
       return res;
     }
     if (pos < size) {
-      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
-          pos, in + pos, size - pos, utf32_output);
+      // rewind_and_convert_with_errors will seek a potential error from in+pos
+      // onward, with the ability to go back up to pos bytes, and read size-pos
+      // bytes forward.
+      result res =
+          scalar::utf8_to_utf16::rewind_and_convert_with_errors<endian>(
+              pos, in + pos, size - pos, utf16_output);
       if (res.error) { // In case of error, we want the error position
         res.count += pos;
         return res;
       } else { // In case of success, we want the number of word written
-        utf32_output += res.count;
+        utf16_output += res.count;
       }
     }
-    return result(error_code::SUCCESS, utf32_output - start);
+    return result(error_code::SUCCESS, utf16_output - start);
   }
 
   simdutf_really_inline bool errors() const {
@@ -39627,143 +54712,67 @@ struct validating_transcoder {
   }
 
 }; // struct utf8_checker
-} // namespace utf8_to_utf32
+} // namespace utf8_to_utf16
 } // unnamed namespace
-} // namespace westmere
+} // namespace lasx
 } // namespace simdutf
-/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
-// other functions
-/* begin file src/generic/utf8.h */
+/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */
+// transcoding from UTF-8 to UTF-32
+/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
 
 namespace simdutf {
-namespace westmere {
+namespace lasx {
 namespace {
-namespace utf8 {
+namespace utf8_to_utf32 {
 
 using namespace simd;
 
-simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
-  size_t pos = 0;
-  size_t count = 0;
-  for (; pos + 64 <= size; pos += 64) {
-    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-    uint64_t utf8_continuation_mask = input.gt(-65);
-    count += count_ones(utf8_continuation_mask);
-  }
-  return count + scalar::utf8::count_code_points(in + pos, size - pos);
-}
-
-simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
-                                                    size_t size) {
-  size_t pos = 0;
-  size_t count = 0;
-  // This algorithm could no doubt be improved!
-  for (; pos + 64 <= size; pos += 64) {
-    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
-    // We count one word for anything that is not a continuation (so
-    // leading bytes).
-    count += 64 - count_ones(utf8_continuation_mask);
-    int64_t utf8_4byte = input.gteq_unsigned(240);
-    count += count_ones(utf8_4byte);
-  }
-  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
-}
-} // namespace utf8
-} // unnamed namespace
-} // namespace westmere
-} // namespace simdutf
-/* end file src/generic/utf8.h */
-/* begin file src/generic/utf16.h */
-namespace simdutf {
-namespace westmere {
-namespace {
-namespace utf16 {
-
-template <endianness big_endian>
-simdutf_really_inline size_t count_code_points(const char16_t *in,
-                                               size_t size) {
-  size_t pos = 0;
-  size_t count = 0;
-  for (; pos < size / 32 * 32; pos += 32) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    if (!match_system(big_endian)) {
-      input.swap_bytes();
-    }
-    uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
-    count += count_ones(not_pair) / 2;
-  }
-  return count +
-         scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
-}
-
-template <endianness big_endian>
-simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
-                                                    size_t size) {
+simdutf_warn_unused size_t convert_valid(const char *input, size_t size,
+                                         char32_t *utf32_output) noexcept {
   size_t pos = 0;
-  size_t count = 0;
-  // This algorithm could no doubt be improved!
-  for (; pos < size / 32 * 32; pos += 32) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    if (!match_system(big_endian)) {
-      input.swap_bytes();
+  char32_t *start{utf32_output};
+  const size_t safety_margin = 16; // to avoid overruns!
+  while (pos + 64 + safety_margin <= size) {
+    simd8x64<int8_t> in(reinterpret_cast<const int8_t *>(input + pos));
+    if (in.is_ascii()) {
+      in.store_ascii_as_utf32(utf32_output);
+      utf32_output += 64;
+      pos += 64;
+    } else {
+      // -65 is 0b10111111 in two's complement, i.e. the largest possible
+      // continuation byte
+      uint64_t utf8_continuation_mask = in.lt(-65 + 1);
+      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
+      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
+      size_t max_starting_point = (pos + 64) - 12;
+      while (pos < max_starting_point) {
+        size_t consumed = convert_masked_utf8_to_utf32(
+            input + pos, utf8_end_of_code_point_mask, utf32_output);
+        pos += consumed;
+        utf8_end_of_code_point_mask >>= consumed;
+      }
     }
-    uint64_t ascii_mask = input.lteq(0x7F);
-    uint64_t twobyte_mask = input.lteq(0x7FF);
-    uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
-
-    size_t ascii_count = count_ones(ascii_mask) / 2;
-    size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
-    size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
-    size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
-    count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count +
-             ascii_count;
-  }
-  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
-                                                                   size - pos);
-}
-
-template <endianness big_endian>
-simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
-                                                     size_t size) {
-  return count_code_points<big_endian>(in, size);
-}
-
-simdutf_really_inline void
-change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
-  size_t pos = 0;
-
-  while (pos < size / 32 * 32) {
-    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
-    input.swap_bytes();
-    input.store(reinterpret_cast<uint16_t *>(output));
-    pos += 32;
-    output += 32;
   }
-
-  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+  utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos,
+                                                       utf32_output);
+  return utf32_output - start;
 }
 
-} // namespace utf16
+} // namespace utf8_to_utf32
 } // unnamed namespace
-} // namespace westmere
+} // namespace lasx
 } // namespace simdutf
-/* end file src/generic/utf16.h */
-// transcoding from UTF-8 to Latin 1
-/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */
+/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */
+/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */
 
 namespace simdutf {
-namespace westmere {
+namespace lasx {
 namespace {
-namespace utf8_to_latin1 {
+namespace utf8_to_utf32 {
 using namespace simd;
 
 simdutf_really_inline simd8<uint8_t>
 check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
-  // For UTF-8 to Latin 1, we can allow any ASCII character, and any
-  // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or
-  // 0b11000010 and nothing else.
-  //
   // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII)
   // Bit 1 = Too Long (ASCII followed by continuation)
   // Bit 2 = Overlong 3-byte
@@ -39790,7 +54799,6 @@ check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
   // 1111011_ 1000____
   // 11111___ 1000____
   constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____
-  constexpr const uint8_t FORBIDDEN = 0xff;
 
   const simd8<uint8_t> byte_1_high = prev1.shr<4>().lookup_16<uint8_t>(
       // 0_______ ________ <ASCII in byte 1>
@@ -39801,11 +54809,11 @@ check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
       // 1100____ ________ <two byte lead in byte 1>
       TOO_SHORT | OVERLONG_2,
       // 1101____ ________ <two byte lead in byte 1>
-      FORBIDDEN,
+      TOO_SHORT,
       // 1110____ ________ <three byte lead in byte 1>
-      FORBIDDEN,
+      TOO_SHORT | OVERLONG_3 | SURROGATE,
       // 1111____ ________ <four+ byte lead in byte 1>
-      FORBIDDEN);
+      TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);
   constexpr const uint8_t CARRY =
       TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 .
   const simd8<uint8_t> byte_1_low =
@@ -39819,16 +54827,23 @@ check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
               CARRY, CARRY,
 
               // ____0100 ________
-              FORBIDDEN,
+              CARRY | TOO_LARGE,
               // ____0101 ________
-              FORBIDDEN,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
               // ____011_ ________
-              FORBIDDEN, FORBIDDEN,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
 
               // ____1___ ________
-              FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
               // ____1101 ________
-              FORBIDDEN, FORBIDDEN, FORBIDDEN);
+              CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
+              CARRY | TOO_LARGE | TOO_LARGE_1000,
+              CARRY | TOO_LARGE | TOO_LARGE_1000);
   const simd8<uint8_t> byte_2_high = input.shr<4>().lookup_16<uint8_t>(
       // ________ 0_______ <ASCII in byte 2>
       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
@@ -39847,6 +54862,17 @@ check_special_cases(const simd8<uint8_t> input, const simd8<uint8_t> prev1) {
       TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);
   return (byte_1_high & byte_1_low & byte_2_high);
 }
+simdutf_really_inline simd8<uint8_t>
+check_multibyte_lengths(const simd8<uint8_t> input,
+                        const simd8<uint8_t> prev_input,
+                        const simd8<uint8_t> sc) {
+  simd8<uint8_t> prev2 = input.prev<2>(prev_input);
+  simd8<uint8_t> prev3 = input.prev<3>(prev_input);
+  simd8<uint8_t> must23 =
+      simd8<uint8_t>(must_be_2_3_continuation(prev2, prev3));
+  simd8<uint8_t> must23_80 = must23 & uint8_t(0x80);
+  return must23_80 ^ sc;
+}
 
 struct validating_transcoder {
   // If this is nonzero, there has been a UTF-8 error.
@@ -39862,33 +54888,33 @@ struct validating_transcoder {
     // lead bytes (2, 3, 4-byte leads become large positive numbers instead of
     // small negative numbers)
     simd8<uint8_t> prev1 = input.prev<1>(prev_input);
-    this->error |= check_special_cases(input, prev1);
+    simd8<uint8_t> sc = check_special_cases(input, prev1);
+    this->error |= check_multibyte_lengths(input, prev_input, sc);
   }
 
   simdutf_really_inline size_t convert(const char *in, size_t size,
-                                       char *latin1_output) {
+                                       char32_t *utf32_output) {
     size_t pos = 0;
-    char *start{latin1_output};
+    char32_t *start{utf32_output};
     // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
+    // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the
     // last 16 bytes, and if the data is valid, then it is entirely safe because
     // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
     // generally assume that you have valid UTF-8 input, so we are going to go
     // back from the end counting 16 leading bytes, to give us a good margin.
     size_t leading_byte = 0;
     size_t margin = size;
-    for (; margin > 0 && leading_byte < 16; margin--) {
-      leading_byte += (int8_t(in[margin - 1]) >
-                       -65); // twos complement of -65 is 1011 1111 ...
+    for (; margin > 0 && leading_byte < 8; margin--) {
+      leading_byte += (int8_t(in[margin - 1]) > -65);
     }
-    // If the input is long enough, then we have that margin-1 is the eight last
-    // leading byte.
+    // If the input is long enough, then after the loop in[margin] is the
+    // eighth-last leading byte.
     const size_t safety_margin = size - margin + 1; // to avoid overruns!
     while (pos + 64 + safety_margin <= size) {
       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
       if (input.is_ascii()) {
-        input.store((int8_t *)latin1_output);
-        latin1_output += 64;
+        input.store_ascii_as_utf32(utf32_output);
+        utf32_output += 64;
         pos += 64;
       } else {
         // you might think that a for-loop would work, but under Visual Studio,
@@ -39907,9 +54933,10 @@ struct validating_transcoder {
           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
         }
-        uint64_t utf8_continuation_mask =
-            input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
-                               // this case, we also have ASCII to account for.
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (utf8_continuation_mask & 1) {
+          return 0; // we have an error
+        }
         uint64_t utf8_leading_mask = ~utf8_continuation_mask;
         uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
         // We process in blocks of up to 12 bytes except possibly
@@ -39927,8 +54954,8 @@ struct validating_transcoder {
           // for this section of the code. Hence, there is a limit
           // to how much we can further increase this latency before
           // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_latin1(
-              in + pos, utf8_end_of_code_point_mask, latin1_output);
+          size_t consumed = convert_masked_utf8_to_utf32(
+              in + pos, utf8_end_of_code_point_mask, utf32_output);
           pos += consumed;
           utf8_end_of_code_point_mask >>= consumed;
         }
@@ -39943,21 +54970,21 @@ struct validating_transcoder {
     }
     if (pos < size) {
       size_t howmany =
-          scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output);
+          scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output);
       if (howmany == 0) {
         return 0;
       }
-      latin1_output += howmany;
+      utf32_output += howmany;
     }
-    return latin1_output - start;
+    return utf32_output - start;
   }
 
   simdutf_really_inline result convert_with_errors(const char *in, size_t size,
-                                                   char *latin1_output) {
+                                                   char32_t *utf32_output) {
     size_t pos = 0;
-    char *start{latin1_output};
+    char32_t *start{utf32_output};
     // In the worst case, we have the haswell kernel which can cause an overflow
-    // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the
+    // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the
     // last 16 bytes, and if the data is valid, then it is entirely safe because
     // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot
     // generally assume that you have valid UTF-8 input, so we are going to go
@@ -39967,14 +54994,14 @@ struct validating_transcoder {
     for (; margin > 0 && leading_byte < 8; margin--) {
       leading_byte += (int8_t(in[margin - 1]) > -65);
     }
-    // If the input is long enough, then we have that margin-1 is the eight last
-    // leading byte.
+    // If the input is long enough, then after the loop in[margin] is the
+    // eighth-last leading byte.
     const size_t safety_margin = size - margin + 1; // to avoid overruns!
     while (pos + 64 + safety_margin <= size) {
       simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
       if (input.is_ascii()) {
-        input.store((int8_t *)latin1_output);
-        latin1_output += 64;
+        input.store_ascii_as_utf32(utf32_output);
+        utf32_output += 64;
         pos += 64;
       } else {
         // you might think that a for-loop would work, but under Visual Studio,
@@ -39993,16 +55020,13 @@ struct validating_transcoder {
           this->check_utf8_bytes(input.chunks[2], input.chunks[1]);
           this->check_utf8_bytes(input.chunks[3], input.chunks[2]);
         }
-        if (errors()) {
-          // rewind_and_convert_with_errors will seek a potential error from
-          // in+pos onward, with the ability to go back up to pos bytes, and
-          // read size-pos bytes forward.
-          result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
-              pos, in + pos, size - pos, latin1_output);
+        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+        if (errors() || (utf8_continuation_mask & 1)) {
+          result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+              pos, in + pos, size - pos, utf32_output);
           res.count += pos;
           return res;
         }
-        uint64_t utf8_continuation_mask = input.lt(-65 + 1);
         uint64_t utf8_leading_mask = ~utf8_continuation_mask;
         uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
         // We process in blocks of up to 12 bytes except possibly
@@ -40020,8 +55044,8 @@ struct validating_transcoder {
           // for this section of the code. Hence, there is a limit
           // to how much we can further increase this latency before
           // it seriously harms performance.
-          size_t consumed = convert_masked_utf8_to_latin1(
-              in + pos, utf8_end_of_code_point_mask, latin1_output);
+          size_t consumed = convert_masked_utf8_to_utf32(
+              in + pos, utf8_end_of_code_point_mask, utf32_output);
           pos += consumed;
           utf8_end_of_code_point_mask >>= consumed;
         }
@@ -40032,28 +55056,22 @@ struct validating_transcoder {
       }
     }
     if (errors()) {
-      // rewind_and_convert_with_errors will seek a potential error from in+pos
-      // onward, with the ability to go back up to pos bytes, and read size-pos
-      // bytes forward.
-      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
-          pos, in + pos, size - pos, latin1_output);
+      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, utf32_output);
       res.count += pos;
       return res;
     }
     if (pos < size) {
-      // rewind_and_convert_with_errors will seek a potential error from in+pos
-      // onward, with the ability to go back up to pos bytes, and read size-pos
-      // bytes forward.
-      result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(
-          pos, in + pos, size - pos, latin1_output);
+      result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(
+          pos, in + pos, size - pos, utf32_output);
       if (res.error) { // In case of error, we want the error position
         res.count += pos;
         return res;
       } else { // In case of success, we want the number of word written
-        latin1_output += res.count;
+        utf32_output += res.count;
       }
     }
-    return result(error_code::SUCCESS, latin1_output - start);
+    return result(error_code::SUCCESS, utf32_output - start);
   }
 
   simdutf_really_inline bool errors() const {
@@ -40061,99 +55079,136 @@ struct validating_transcoder {
   }
 
 }; // struct utf8_checker
-} // namespace utf8_to_latin1
+} // namespace utf8_to_utf32
 } // unnamed namespace
-} // namespace westmere
+} // namespace lasx
 } // namespace simdutf
-/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */
-/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
+/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */
+
+
+// other functions
+/* begin file src/generic/utf8.h */
 
 namespace simdutf {
-namespace westmere {
+namespace lasx {
 namespace {
-namespace utf8_to_latin1 {
+namespace utf8 {
+
 using namespace simd;
 
-simdutf_really_inline size_t convert_valid(const char *in, size_t size,
-                                           char *latin1_output) {
+simdutf_really_inline size_t count_code_points(const char *in, size_t size) {
   size_t pos = 0;
-  char *start{latin1_output};
-  // In the worst case, we have the haswell kernel which can cause an overflow
-  // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last
-  // 16 bytes, and if the data is valid, then it is entirely safe because 16
-  // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally
-  // assume that you have valid UTF-8 input, so we are going to go back from the
-  // end counting 8 leading bytes, to give us a good margin.
-  size_t leading_byte = 0;
-  size_t margin = size;
-  for (; margin > 0 && leading_byte < 8; margin--) {
-    leading_byte += (int8_t(in[margin - 1]) >
-                     -65); // twos complement of -65 is 1011 1111 ...
+  size_t count = 0;
+  for (; pos + 64 <= size; pos += 64) {
+    simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
+    uint64_t utf8_continuation_mask = input.gt(-65);
+    count += count_ones(utf8_continuation_mask);
   }
-  // If the input is long enough, then we have that margin-1 is the eight last
-  // leading byte.
-  const size_t safety_margin = size - margin + 1; // to avoid overruns!
-  while (pos + 64 + safety_margin <= size) {
+  return count + scalar::utf8::count_code_points(in + pos, size - pos);
+}
+
+simdutf_really_inline size_t utf16_length_from_utf8(const char *in,
+                                                    size_t size) {
+  size_t pos = 0;
+  size_t count = 0;
+  // This algorithm could no doubt be improved!
+  for (; pos + 64 <= size; pos += 64) {
     simd8x64<int8_t> input(reinterpret_cast<const int8_t *>(in + pos));
-    if (input.is_ascii()) {
-      input.store((int8_t *)latin1_output);
-      latin1_output += 64;
-      pos += 64;
-    } else {
-      // you might think that a for-loop would work, but under Visual Studio, it
-      // is not good enough.
-      uint64_t utf8_continuation_mask =
-          input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
-                             // this case, we also have ASCII to account for.
-      uint64_t utf8_leading_mask = ~utf8_continuation_mask;
-      uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
-      // We process in blocks of up to 12 bytes except possibly
-      // for fast paths which may process up to 16 bytes. For the
-      // slow path to work, we should have at least 12 input bytes left.
-      size_t max_starting_point = (pos + 64) - 12;
-      // Next loop is going to run at least five times.
-      while (pos < max_starting_point) {
-        // Performance note: our ability to compute 'consumed' and
-        // then shift and recompute is critical. If there is a
-        // latency of, say, 4 cycles on getting 'consumed', then
-        // the inner loop might have a total latency of about 6 cycles.
-        // Yet we process between 6 to 12 inputs bytes, thus we get
-        // a speed limit between 1 cycle/byte and 0.5 cycle/byte
-        // for this section of the code. Hence, there is a limit
-        // to how much we can further increase this latency before
-        // it seriously harms performance.
-        size_t consumed = convert_masked_utf8_to_latin1(
-            in + pos, utf8_end_of_code_point_mask, latin1_output);
-        pos += consumed;
-        utf8_end_of_code_point_mask >>= consumed;
-      }
-      // At this point there may remain between 0 and 12 bytes in the
-      // 64-byte block. These bytes will be processed again. So we have an
-      // 80% efficiency (in the worst case). In practice we expect an
-      // 85% to 90% efficiency.
+    uint64_t utf8_continuation_mask = input.lt(-65 + 1);
+    // We count one word for anything that is not a continuation (so
+    // leading bytes).
+    count += 64 - count_ones(utf8_continuation_mask);
+    int64_t utf8_4byte = input.gteq_unsigned(240);
+    count += count_ones(utf8_4byte);
+  }
+  return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos);
+}
+} // namespace utf8
+} // unnamed namespace
+} // namespace lasx
+} // namespace simdutf
+/* end file src/generic/utf8.h */
+/* begin file src/generic/utf16.h */
+namespace simdutf {
+namespace lasx {
+namespace {
+namespace utf16 {
+
+template <endianness big_endian>
+simdutf_really_inline size_t count_code_points(const char16_t *in,
+                                               size_t size) {
+  size_t pos = 0;
+  size_t count = 0;
+  for (; pos < size / 32 * 32; pos += 32) {
+    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+    if (!match_system(big_endian)) {
+      input.swap_bytes();
     }
+    uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF);
+    count += count_ones(not_pair) / 2;
   }
-  if (pos < size) {
-    size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos,
-                                                           latin1_output);
-    latin1_output += howmany;
+  return count +
+         scalar::utf16::count_code_points<big_endian>(in + pos, size - pos);
+}
+
+template <endianness big_endian>
+simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in,
+                                                    size_t size) {
+  size_t pos = 0;
+  size_t count = 0;
+  // This algorithm could no doubt be improved!
+  for (; pos < size / 32 * 32; pos += 32) {
+    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+    if (!match_system(big_endian)) {
+      input.swap_bytes();
+    }
+    uint64_t ascii_mask = input.lteq(0x7F);
+    uint64_t twobyte_mask = input.lteq(0x7FF);
+    uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF);
+
+    size_t ascii_count = count_ones(ascii_mask) / 2;
+    size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2;
+    size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2;
+    size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2;
+    count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count +
+             ascii_count;
   }
-  return latin1_output - start;
+  return count + scalar::utf16::utf8_length_from_utf16<big_endian>(in + pos,
+                                                                   size - pos);
 }
 
-} // namespace utf8_to_latin1
-} // namespace
-} // namespace westmere
+template <endianness big_endian>
+simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in,
+                                                     size_t size) {
+  return count_code_points<big_endian>(in, size);
+}
+
+simdutf_really_inline void
+change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) {
+  size_t pos = 0;
+
+  while (pos < size / 32 * 32) {
+    simd16x32<uint16_t> input(reinterpret_cast<const uint16_t *>(in + pos));
+    input.swap_bytes();
+    input.store(reinterpret_cast<uint16_t *>(output));
+    pos += 32;
+    output += 32;
+  }
+
+  scalar::utf16::change_endianness_utf16(in + pos, size - pos, output);
+}
+
+} // namespace utf16
+} // unnamed namespace
+} // namespace lasx
 } // namespace simdutf
-  // namespace simdutf
-/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */
+/* end file src/generic/utf16.h */
 
 //
 // Implementation-specific overrides
 //
-
 namespace simdutf {
-namespace westmere {
+namespace lasx {
 
 simdutf_warn_unused int
 implementation::detect_encodings(const char *input,
@@ -40184,34 +55239,32 @@ implementation::detect_encodings(const char *input,
 
 simdutf_warn_unused bool
 implementation::validate_utf8(const char *buf, size_t len) const noexcept {
-  return westmere::utf8_validation::generic_validate_utf8(buf, len);
+  return lasx::utf8_validation::generic_validate_utf8(buf, len); // generic validator, lasx namespace
 }
 
 simdutf_warn_unused result implementation::validate_utf8_with_errors(
     const char *buf, size_t len) const noexcept {
-  return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len);
+  return lasx::utf8_validation::generic_validate_utf8_with_errors(buf, len); // generic validator, lasx namespace
 }
 
 simdutf_warn_unused bool
 implementation::validate_ascii(const char *buf, size_t len) const noexcept {
-  return westmere::utf8_validation::generic_validate_ascii(buf, len);
+  return lasx::utf8_validation::generic_validate_ascii(buf, len); // generic validator, lasx namespace
 }
 
 simdutf_warn_unused result implementation::validate_ascii_with_errors(
     const char *buf, size_t len) const noexcept {
-  return westmere::utf8_validation::generic_validate_ascii_with_errors(buf,
-                                                                       len);
+  return lasx::utf8_validation::generic_validate_ascii_with_errors(buf, len); // generic validator, lasx namespace
 }
 
 simdutf_warn_unused bool
 implementation::validate_utf16le(const char16_t *buf,
                                  size_t len) const noexcept {
   if (simdutf_unlikely(len == 0)) {
-    // empty input is valid UTF-16. protect the implementation from
-    // handling nullptr
+    // empty input is valid; protect the implementation from nullptr.
     return true;
   }
-  const char16_t *tail = sse_validate_utf16<endianness::LITTLE>(buf, len);
+  const char16_t *tail = lasx_validate_utf16<endianness::LITTLE>(buf, len);
   if (tail) {
     return scalar::utf16::validate<endianness::LITTLE>(tail,
                                                        len - (tail - buf));
@@ -40224,11 +55277,10 @@ simdutf_warn_unused bool
 implementation::validate_utf16be(const char16_t *buf,
                                  size_t len) const noexcept {
   if (simdutf_unlikely(len == 0)) {
-    // empty input is valid UTF-16. protect the implementation from
-    // handling nullptr
+    // empty input is valid; protect the implementation from nullptr.
     return true;
   }
-  const char16_t *tail = sse_validate_utf16<endianness::BIG>(buf, len);
+  const char16_t *tail = lasx_validate_utf16<endianness::BIG>(buf, len);
   if (tail) {
     return scalar::utf16::validate<endianness::BIG>(tail, len - (tail - buf));
   } else {
@@ -40238,7 +55290,10 @@ implementation::validate_utf16be(const char16_t *buf,
 
 simdutf_warn_unused result implementation::validate_utf16le_with_errors(
     const char16_t *buf, size_t len) const noexcept {
-  result res = sse_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+  result res = lasx_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
   if (res.count != len) {
     result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(
         buf + res.count, len - res.count);
@@ -40250,7 +55305,10 @@ simdutf_warn_unused result implementation::validate_utf16le_with_errors(
 
 simdutf_warn_unused result implementation::validate_utf16be_with_errors(
     const char16_t *buf, size_t len) const noexcept {
-  result res = sse_validate_utf16_with_errors<endianness::BIG>(buf, len);
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
+  result res = lasx_validate_utf16_with_errors<endianness::BIG>(buf, len);
   if (res.count != len) {
     result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(
         buf + res.count, len - res.count);
@@ -40263,11 +55321,10 @@ simdutf_warn_unused result implementation::validate_utf16be_with_errors(
 simdutf_warn_unused bool
 implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
   if (simdutf_unlikely(len == 0)) {
-    // empty input is valid UTF-32. protect the implementation from
-    // handling nullptr
+    // empty input is valid; protect the implementation from nullptr.
     return true;
   }
-  const char32_t *tail = sse_validate_utf32le(buf, len);
+  const char32_t *tail = lasx_validate_utf32le(buf, len);
   if (tail) {
     return scalar::utf32::validate(tail, len - (tail - buf));
   } else {
@@ -40277,12 +55334,10 @@ implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept {
 
 simdutf_warn_unused result implementation::validate_utf32_with_errors(
     const char32_t *buf, size_t len) const noexcept {
-  if (len == 0) {
-    // empty input is valid UTF-32. protect the implementation from
-    // handling nullptr
+  if (simdutf_unlikely(len == 0)) {
     return result(error_code::SUCCESS, 0);
   }
-  result res = sse_validate_utf32le_with_errors(buf, len);
+  result res = lasx_validate_utf32le_with_errors(buf, len);
   if (res.count != len) {
     result scalar_res =
         scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
@@ -40294,9 +55349,8 @@ simdutf_warn_unused result implementation::validate_utf32_with_errors(
 
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
     const char *buf, size_t len, char *utf8_output) const noexcept {
-
   std::pair<const char *, char *> ret =
-      sse_convert_latin1_to_utf8(buf, len, utf8_output);
+      lasx_convert_latin1_to_utf8(buf, len, utf8_output);
   size_t converted_chars = ret.second - utf8_output;
 
   if (ret.first != buf + len) {
@@ -40304,25 +55358,18 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf8(
         ret.first, len - (ret.first - buf), ret.second);
     converted_chars += scalar_converted_chars;
   }
-
   return converted_chars;
 }
 
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
     const char *buf, size_t len, char16_t *utf16_output) const noexcept {
   std::pair<const char *, char16_t *> ret =
-      sse_convert_latin1_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
+      lasx_convert_latin1_to_utf16le(buf, len, utf16_output);
   size_t converted_chars = ret.second - utf16_output;
   if (ret.first != buf + len) {
     const size_t scalar_converted_chars =
         scalar::latin1_to_utf16::convert<endianness::LITTLE>(
             ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_converted_chars == 0) {
-      return 0;
-    }
     converted_chars += scalar_converted_chars;
   }
   return converted_chars;
@@ -40331,18 +55378,12 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le(
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
     const char *buf, size_t len, char16_t *utf16_output) const noexcept {
   std::pair<const char *, char16_t *> ret =
-      sse_convert_latin1_to_utf16<endianness::BIG>(buf, len, utf16_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
+      lasx_convert_latin1_to_utf16be(buf, len, utf16_output);
   size_t converted_chars = ret.second - utf16_output;
   if (ret.first != buf + len) {
     const size_t scalar_converted_chars =
         scalar::latin1_to_utf16::convert<endianness::BIG>(
             ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_converted_chars == 0) {
-      return 0;
-    }
     converted_chars += scalar_converted_chars;
   }
   return converted_chars;
@@ -40351,17 +55392,11 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be(
 simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
     const char *buf, size_t len, char32_t *utf32_output) const noexcept {
   std::pair<const char *, char32_t *> ret =
-      sse_convert_latin1_to_utf32(buf, len, utf32_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
+      lasx_convert_latin1_to_utf32(buf, len, utf32_output);
   size_t converted_chars = ret.second - utf32_output;
   if (ret.first != buf + len) {
     const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert(
         ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_converted_chars == 0) {
-      return 0;
-    }
     converted_chars += scalar_converted_chars;
   }
   return converted_chars;
@@ -40369,19 +55404,117 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf32(
 
 simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(
     const char *buf, size_t len, char *latin1_output) const noexcept {
+  size_t pos = 0; // input bytes consumed by the scalar prologue
+  char *output_start{latin1_output}; // kept to measure how much the prologue wrote
+  // Scalar prologue until latin1_output is 32-byte aligned (unaligned is slower).
+  while (((uint64_t)latin1_output & 0x1F) && pos < len) {
+    if (buf[pos] & 0x80) { // non-ASCII byte
+      if (pos + 1 >= len) // no second byte available
+        return 0;
+      if ((buf[pos] & 0b11100000) == 0b11000000) { // 110xxxxx: two-byte lead
+        if ((buf[pos + 1] & 0b11000000) != 0b10000000) // next byte is not 10xxxxxx
+          return 0;
+        uint32_t code_point =
+            (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111);
+        if (code_point < 0x80 || 0xFF < code_point) { // overlong, or above one Latin-1 byte
+          return 0;
+        }
+        *latin1_output++ = char(code_point);
+        pos += 2;
+      } else { // every other non-ASCII lead byte is rejected
+        return 0;
+      }
+    } else { // ASCII byte is copied as is
+      *latin1_output++ = char(buf[pos]);
+      pos++;
+    }
+  }
+  size_t convert_size = latin1_output - output_start; // bytes written by the prologue
+  if (pos == len) // the prologue consumed the whole input
+    return convert_size;
   utf8_to_latin1::validating_transcoder converter;
-  return converter.convert(buf, len, latin1_output);
+  size_t convert_result =
+      converter.convert(buf + pos, len - pos, latin1_output);
+  return convert_result ? convert_size + convert_result : 0; // a zero from the transcoder fails the whole call
 }
 
 simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(
     const char *buf, size_t len, char *latin1_output) const noexcept {
+  size_t pos = 0; // input bytes consumed by the scalar prologue
+  char *output_start{latin1_output}; // kept to measure how much the prologue wrote
+  // Scalar prologue until latin1_output is 32-byte aligned (unaligned is slower).
+  while (((uint64_t)latin1_output & 0x1F) && pos < len) {
+    if (buf[pos] & 0x80) { // non-ASCII byte
+      if ((buf[pos] & 0b11100000) == 0b11000000) { // 110xxxxx: two-byte lead
+        if (pos + 1 >= len) // sequence cut off by the end of the input
+          return result(error_code::TOO_SHORT, pos);
+        if ((buf[pos + 1] & 0b11000000) != 0b10000000) // next byte is not 10xxxxxx
+          return result(error_code::TOO_SHORT, pos);
+        uint32_t code_point =
+            (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111);
+        if (code_point < 0x80) // two bytes used for an ASCII value
+          return result(error_code::OVERLONG, pos);
+        if (0xFF < code_point) // does not fit in one Latin-1 byte
+          return result(error_code::TOO_LARGE, pos);
+        *latin1_output++ = char(code_point);
+        pos += 2;
+      } else if ((buf[pos] & 0b11110000) == 0b11100000) { // 1110xxxx: three-byte lead
+        return result(error_code::TOO_LARGE, pos);
+      } else if ((buf[pos] & 0b11111000) == 0b11110000) { // 11110xxx: four-byte lead
+        return result(error_code::TOO_LARGE, pos);
+      } else {
+        if ((buf[pos] & 0b11000000) == 0b10000000) { // 10xxxxxx with no lead byte before it
+          return result(error_code::TOO_LONG, pos);
+        }
+        return result(error_code::HEADER_BITS, pos); // only 11111xxx is left here
+      }
+    } else { // ASCII byte is copied as is
+      *latin1_output++ = char(buf[pos]);
+      pos++;
+    }
+  }
+  size_t convert_size = latin1_output - output_start; // bytes written by the prologue
+  if (pos == len) // the prologue consumed the whole input
+    return result(error_code::SUCCESS, convert_size);
+
   utf8_to_latin1::validating_transcoder converter;
-  return converter.convert_with_errors(buf, len, latin1_output);
+  result res =
+      converter.convert_with_errors(buf + pos, len - pos, latin1_output);
+  return res.error ? result(res.error, res.count + pos) // error: offset by the input already consumed
+                   : result(res.error, res.count + convert_size); // success: offset by the bytes already written
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(
     const char *buf, size_t len, char *latin1_output) const noexcept {
-  return westmere::utf8_to_latin1::convert_valid(buf, len, latin1_output);
+  size_t pos = 0; // input bytes consumed by the scalar prologue
+  char *output_start{latin1_output}; // kept to measure how much the prologue wrote
+  // Scalar prologue until latin1_output is 32-byte aligned (unaligned is slower).
+  while (((uint64_t)latin1_output & 0x1F) && pos < len) {
+    if (buf[pos] & 0x80) { // non-ASCII byte
+      if (pos + 1 >= len) // lone trailing byte: leave it to convert_valid below
+        break;
+      if ((buf[pos] & 0b11100000) == 0b11000000) { // 110xxxxx: two-byte lead
+        if ((buf[pos + 1] & 0b11000000) != 0b10000000) // next byte is not 10xxxxxx
+          return 0;
+        uint32_t code_point =
+            (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111);
+        *latin1_output++ = char(code_point); // no range check on this path
+        pos += 2;
+      } else { // every other non-ASCII lead byte is rejected
+        return 0;
+      }
+    } else { // ASCII byte is copied as is
+      *latin1_output++ = char(buf[pos]);
+      pos++;
+    }
+  }
+  size_t convert_size = latin1_output - output_start; // bytes written by the prologue
+  if (pos == len) // the prologue consumed the whole input
+    return convert_size;
+
+  size_t convert_result =
+      lasx::utf8_to_latin1::convert_valid(buf + pos, len - pos, latin1_output);
+  return convert_result ? convert_size + convert_result : 0; // a zero from convert_valid makes the call return 0
 }
 
 simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le(
@@ -40441,7 +55574,7 @@ simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32(
 simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
   std::pair<const char16_t *, char *> ret =
-      sse_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
+      lasx_convert_utf16_to_latin1<endianness::LITTLE>(buf, len, latin1_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -40462,7 +55595,7 @@ simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1(
 simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1(
     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
   std::pair<const char16_t *, char *> ret =
-      sse_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
+      lasx_convert_utf16_to_latin1<endianness::BIG>(buf, len, latin1_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -40484,7 +55617,7 @@ simdutf_warn_unused result
 implementation::convert_utf16le_to_latin1_with_errors(
     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
   std::pair<result, char *> ret =
-      sse_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
+      lasx_convert_utf16_to_latin1_with_errors<endianness::LITTLE>(
           buf, len, latin1_output);
   if (ret.first.error) {
     return ret.first;
@@ -40511,8 +55644,8 @@ simdutf_warn_unused result
 implementation::convert_utf16be_to_latin1_with_errors(
     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
   std::pair<result, char *> ret =
-      sse_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
-                                                               latin1_output);
+      lasx_convert_utf16_to_latin1_with_errors<endianness::BIG>(buf, len,
+                                                                latin1_output);
   if (ret.first.error) {
     return ret.first;
   } // Can return directly since scalar fallback already found correct
@@ -40536,20 +55669,20 @@ implementation::convert_utf16be_to_latin1_with_errors(
 
 simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1(
     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  // optimization opportunity: we could provide an optimized function.
+  // No dedicated kernel yet (optimization opportunity): reuse the UTF-16BE one.
   return convert_utf16be_to_latin1(buf, len, latin1_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1(
     const char16_t *buf, size_t len, char *latin1_output) const noexcept {
-  // optimization opportunity: we could provide an optimized function.
+  // No dedicated kernel yet (optimization opportunity): reuse the UTF-16LE one.
   return convert_utf16le_to_latin1(buf, len, latin1_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
     const char16_t *buf, size_t len, char *utf8_output) const noexcept {
   std::pair<const char16_t *, char *> ret =
-      sse_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
+      lasx_convert_utf16_to_utf8<endianness::LITTLE>(buf, len, utf8_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -40569,7 +55702,7 @@ simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8(
 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8(
     const char16_t *buf, size_t len, char *utf8_output) const noexcept {
   std::pair<const char16_t *, char *> ret =
-      sse_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
+      lasx_convert_utf16_to_utf8<endianness::BIG>(buf, len, utf8_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -40591,8 +55724,8 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char *> ret =
-      westmere::sse_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(
-          buf, len, utf8_output);
+      lasx_convert_utf16_to_utf8_with_errors<endianness::LITTLE>(buf, len,
+                                                                 utf8_output);
   if (ret.first.error) {
     return ret.first;
   } // Can return directly since scalar fallback already found correct
@@ -40619,8 +55752,8 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char *> ret =
-      westmere::sse_convert_utf16_to_utf8_with_errors<endianness::BIG>(
-          buf, len, utf8_output);
+      lasx_convert_utf16_to_utf8_with_errors<endianness::BIG>(buf, len,
+                                                              utf8_output);
   if (ret.first.error) {
     return ret.first;
   } // Can return directly since scalar fallback already found correct
@@ -40652,59 +55785,13 @@ simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(
   return convert_utf16be_to_utf8(buf, len, utf8_output);
 }
 
-simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
-    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
-  std::pair<const char32_t *, char *> ret =
-      sse_convert_utf32_to_latin1(buf, len, latin1_output);
-  if (ret.first == nullptr) {
-    return 0;
-  }
-  size_t saved_bytes = ret.second - latin1_output;
-  // if (ret.first != buf + len) {
-  if (ret.first < buf + len) {
-    const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert(
-        ret.first, len - (ret.first - buf), ret.second);
-    if (scalar_saved_bytes == 0) {
-      return 0;
-    }
-    saved_bytes += scalar_saved_bytes;
-  }
-  return saved_bytes;
-}
-
-simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
-    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
-  // ret.first.count is always the position in the buffer, not the number of
-  // code units written even if finished
-  std::pair<result, char *> ret =
-      westmere::sse_convert_utf32_to_latin1_with_errors(buf, len,
-                                                        latin1_output);
-  if (ret.first.count != len) {
-    result scalar_res = scalar::utf32_to_latin1::convert_with_errors(
-        buf + ret.first.count, len - ret.first.count, ret.second);
-    if (scalar_res.error) {
-      scalar_res.count += ret.first.count;
-      return scalar_res;
-    } else {
-      ret.second += scalar_res.count;
-    }
-  }
-  ret.first.count =
-      ret.second -
-      latin1_output; // Set count to the number of 8-bit code units written
-  return ret.first;
-}
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
-    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
-  // optimization opportunity: we could provide an optimized function.
-  return convert_utf32_to_latin1(buf, len, latin1_output);
-}
-
 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
     const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return 0;
+  }
   std::pair<const char32_t *, char *> ret =
-      sse_convert_utf32_to_utf8(buf, len, utf8_output);
+      lasx_convert_utf32_to_utf8(buf, len, utf8_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -40722,10 +55809,13 @@ simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(
 
 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
     const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char *> ret =
-      westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
+      lasx_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
   if (ret.first.count != len) {
     result scalar_res = scalar::utf32_to_utf8::convert_with_errors(
         buf + ret.first.count, len - ret.first.count, ret.second);
@@ -40745,7 +55835,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
 simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
     const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
   std::pair<const char16_t *, char32_t *> ret =
-      sse_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
+      lasx_convert_utf16_to_utf32<endianness::LITTLE>(buf, len, utf32_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -40765,7 +55855,7 @@ simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32(
 simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32(
     const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept {
   std::pair<const char16_t *, char32_t *> ret =
-      sse_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
+      lasx_convert_utf16_to_utf32<endianness::BIG>(buf, len, utf32_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -40787,8 +55877,8 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors(
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char32_t *> ret =
-      westmere::sse_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(
-          buf, len, utf32_output);
+      lasx_convert_utf16_to_utf32_with_errors<endianness::LITTLE>(buf, len,
+                                                                  utf32_output);
   if (ret.first.error) {
     return ret.first;
   } // Can return directly since scalar fallback already found correct
@@ -40815,8 +55905,8 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char32_t *> ret =
-      westmere::sse_convert_utf16_to_utf32_with_errors<endianness::BIG>(
-          buf, len, utf32_output);
+      lasx_convert_utf16_to_utf32_with_errors<endianness::BIG>(buf, len,
+                                                               utf32_output);
   if (ret.first.error) {
     return ret.first;
   } // Can return directly since scalar fallback already found correct
@@ -40838,15 +55928,77 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors(
   return ret.first;
 }
 
+simdutf_warn_unused size_t implementation::convert_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  // SIMD pass: `first` is where reading stopped (null is treated as failure),
+  // `second` is the output cursor after the bytes written so far.
+  const std::pair<const char32_t *, char *> simd =
+      lasx_convert_utf32_to_latin1(buf, len, latin1_output);
+  if (simd.first == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = simd.second - latin1_output;
+  if (simd.first == buf + len) {
+    return simd_written; // nothing left for the scalar routine
+  }
+  // Scalar pass over the remaining tail; a zero result fails the whole call.
+  const size_t remaining = len - (simd.first - buf);
+  const size_t tail_written =
+      scalar::utf32_to_latin1::convert(simd.first, remaining, simd.second);
+  return tail_written == 0 ? 0 : simd_written + tail_written;
+}
+
+simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  // SIMD pass: `first.count` is a position in `buf`, `second` the output cursor.
+  std::pair<result, char *> simd =
+      lasx_convert_utf32_to_latin1_with_errors(buf, len, latin1_output);
+  if (simd.first.error) {
+    return simd.first; // returned exactly as the SIMD routine reported it
+  }
+  const size_t consumed = simd.first.count;
+  if (consumed != len) {
+    // No error so far but input remains: let the scalar converter finish.
+    result tail = scalar::utf32_to_latin1::convert_with_errors(
+        buf + consumed, len - consumed, simd.second);
+    if (tail.error) {
+      tail.count += consumed; // rebase the error position onto `buf`
+      return tail;
+    }
+    simd.second += tail.count;
+  }
+  // On success `count` becomes the number of 8-bit code units written.
+  simd.first.count = simd.second - latin1_output;
+  return simd.first;
+}
+
+simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1(
+    const char32_t *buf, size_t len, char *latin1_output) const noexcept {
+  // SIMD pass; a null `first` makes the call return 0.
+  const std::pair<const char32_t *, char *> simd =
+      lasx_convert_utf32_to_latin1(buf, len, latin1_output);
+  if (simd.first == nullptr) {
+    return 0;
+  }
+  const size_t simd_written = simd.second - latin1_output;
+  if (simd.first == buf + len) {
+    return simd_written;
+  }
+  // The tail goes through scalar convert_valid; its count is added unchecked.
+  return simd_written + scalar::utf32_to_latin1::convert_valid(
+                            simd.first, len - (simd.first - buf), simd.second);
+}
+
 simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
     const char32_t *buf, size_t len, char *utf8_output) const noexcept {
+  // No dedicated kernel yet (optimization opportunity): reuse the general one.
   return convert_utf32_to_utf8(buf, len, utf8_output);
 }
 
 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
     const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
   std::pair<const char32_t *, char16_t *> ret =
-      sse_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
+      lasx_convert_utf32_to_utf16<endianness::LITTLE>(buf, len, utf16_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -40860,13 +56012,14 @@ simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
     }
     saved_bytes += scalar_saved_bytes;
   }
+
   return saved_bytes;
 }
 
 simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
     const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
   std::pair<const char32_t *, char16_t *> ret =
-      sse_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
+      lasx_convert_utf32_to_utf16<endianness::BIG>(buf, len, utf16_output);
   if (ret.first == nullptr) {
     return 0;
   }
@@ -40888,8 +56041,8 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char16_t *> ret =
-      westmere::sse_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(
-          buf, len, utf16_output);
+      lasx_convert_utf32_to_utf16_with_errors<endianness::LITTLE>(buf, len,
+                                                                  utf16_output);
   if (ret.first.count != len) {
     result scalar_res =
         scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
@@ -40912,8 +56065,8 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors(
   // ret.first.count is always the position in the buffer, not the number of
   // code units written even if finished
   std::pair<result, char16_t *> ret =
-      westmere::sse_convert_utf32_to_utf16_with_errors<endianness::BIG>(
-          buf, len, utf16_output);
+      lasx_convert_utf32_to_utf16_with_errors<endianness::BIG>(buf, len,
+                                                               utf16_output);
   if (ret.first.count != len) {
     result scalar_res =
         scalar::utf32_to_utf16::convert_with_errors<endianness::BIG>(
@@ -40969,7 +56122,23 @@ simdutf_warn_unused size_t implementation::count_utf16be(
 
 simdutf_warn_unused size_t
 implementation::count_utf8(const char *input, size_t length) const noexcept {
-  return utf8::count_code_points(input, length);
+  // Counts code points: every byte that is not a 10xxxxxx continuation byte.
+  size_t pos = 0;
+  size_t count = 0;
+  // Scalar prologue up to a 32-byte boundary: unaligned access is slower here.
+  while ((((uint64_t)input + pos) & 0x1F && pos < length)) {
+    // int8_t cast: where plain char is unsigned, `> -65` would always be true.
+    count += int8_t(input[pos++]) > -65;
+  }
+  __m256i v_bf = __lasx_xvldi(0xBF); // 0b10111111
+  for (; pos + 32 <= length; pos += 32) {
+    __m256i in = __lasx_xvld(reinterpret_cast<const int8_t *>(input + pos), 0);
+    __m256i utf8_count =
+        __lasx_xvpcnt_h(__lasx_xvmskltz_b(__lasx_xvslt_b(v_bf, in)));
+    count = count + __lasx_xvpickve2gr_wu(utf8_count, 0) +
+            __lasx_xvpickve2gr_wu(utf8_count, 4);
+  }
+  return count + scalar::utf8::count_code_points(input + pos, length - pos);
 }
 
 simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
@@ -40979,12 +56148,29 @@ simdutf_warn_unused size_t implementation::latin1_length_from_utf8(
 
 simdutf_warn_unused size_t
 implementation::latin1_length_from_utf16(size_t length) const noexcept {
-  return scalar::utf16::latin1_length_from_utf16(length);
+  return length; // returned unchanged: one Latin-1 byte per UTF-16 code unit
 }
 
 simdutf_warn_unused size_t
 implementation::latin1_length_from_utf32(size_t length) const noexcept {
-  return scalar::utf32::latin1_length_from_utf32(length);
+  return length; // returned unchanged: one Latin-1 byte per UTF-32 code unit
+}
+
+simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
+    const char *input, size_t length) const noexcept {
+  const uint8_t *data = reinterpret_cast<const uint8_t *>(input);
+  const uint8_t *data_end = data + length;
+  uint64_t result = 0; // running total of UTF-8 bytes needed
+  while (data_end - data >= 16) { // distance test: never forms an out-of-range data + 16
+    __m128i input_vec = __lsx_vld(data, 0);
+    // Bytes with the top bit set need two UTF-8 bytes: popcount the sign mask.
+    const uint64_t two_bytes =
+        __lsx_vpickve2gr_hu(__lsx_vpcnt_h(__lsx_vmskltz_b(input_vec)), 0);
+    result += 16 + two_bytes;
+    data += 16;
+  }
+  return result + scalar::latin1::utf8_length_from_latin1((const char *)data,
+                                                          data_end - data);
 }
 
 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(
@@ -40999,72 +56185,12 @@ simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(
 
 simdutf_warn_unused size_t
 implementation::utf16_length_from_latin1(size_t length) const noexcept {
-  return scalar::latin1::utf16_length_from_latin1(length);
+  return length; // one UTF-16 code unit per Latin-1 byte
 }
 
 simdutf_warn_unused size_t
 implementation::utf32_length_from_latin1(size_t length) const noexcept {
-  return scalar::latin1::utf32_length_from_latin1(length);
-}
-
-simdutf_warn_unused size_t implementation::utf8_length_from_latin1(
-    const char *input, size_t len) const noexcept {
-  const uint8_t *str = reinterpret_cast<const uint8_t *>(input);
-  size_t answer = len / sizeof(__m128i) * sizeof(__m128i);
-  size_t i = 0;
-  if (answer >= 2048) { // long strings optimization
-    __m128i two_64bits = _mm_setzero_si128();
-    while (i + sizeof(__m128i) <= len) {
-      __m128i runner = _mm_setzero_si128();
-      size_t iterations = (len - i) / sizeof(__m128i);
-      if (iterations > 255) {
-        iterations = 255;
-      }
-      size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i);
-      for (; i + 4 * sizeof(__m128i) <= max_i; i += 4 * sizeof(__m128i)) {
-        __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i));
-        __m128i input2 =
-            _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i)));
-        __m128i input3 =
-            _mm_loadu_si128((const __m128i *)(str + i + 2 * sizeof(__m128i)));
-        __m128i input4 =
-            _mm_loadu_si128((const __m128i *)(str + i + 3 * sizeof(__m128i)));
-        __m128i input12 =
-            _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input1),
-                         _mm_cmpgt_epi8(_mm_setzero_si128(), input2));
-        __m128i input34 =
-            _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input3),
-                         _mm_cmpgt_epi8(_mm_setzero_si128(), input4));
-        __m128i input1234 = _mm_add_epi8(input12, input34);
-        runner = _mm_sub_epi8(runner, input1234);
-      }
-      for (; i <= max_i; i += sizeof(__m128i)) {
-        __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i));
-        runner = _mm_sub_epi8(runner,
-                              _mm_cmpgt_epi8(_mm_setzero_si128(), more_input));
-      }
-      two_64bits =
-          _mm_add_epi64(two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128()));
-    }
-    answer +=
-        _mm_extract_epi64(two_64bits, 0) + _mm_extract_epi64(two_64bits, 1);
-  } else if (answer > 0) { // short string optimization
-    for (; i + 2 * sizeof(__m128i) <= len; i += 2 * sizeof(__m128i)) {
-      __m128i latin = _mm_loadu_si128((const __m128i *)(input + i));
-      uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin);
-      answer += count_ones(non_ascii);
-      latin = _mm_loadu_si128((const __m128i *)(input + i) + 1);
-      non_ascii = (uint16_t)_mm_movemask_epi8(latin);
-      answer += count_ones(non_ascii);
-    }
-    for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i)) {
-      __m128i latin = _mm_loadu_si128((const __m128i *)(input + i));
-      uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin);
-      answer += count_ones(non_ascii);
-    }
-  }
-  return answer + scalar::latin1::utf8_length_from_latin1(
-                      reinterpret_cast<const char *>(str + i), len - i);
+  return length; // one UTF-32 code unit per Latin-1 byte
 }
 
 simdutf_warn_unused size_t implementation::utf32_length_from_utf16le(
@@ -41084,35 +56210,35 @@ simdutf_warn_unused size_t implementation::utf16_length_from_utf8(
 
 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
     const char32_t *input, size_t length) const noexcept {
-  const __m128i v_00000000 = _mm_setzero_si128();
-  const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80);
-  const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800);
-  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
+  __m256i v_80 = __lasx_xvrepli_w(0x80); /*0x00000080*/
+  __m256i v_800 = __lasx_xvldi(-3832);   /*0x00000800*/
+  __m256i v_10000 = __lasx_xvldi(-3583); /*0x00010000*/
   size_t pos = 0;
   size_t count = 0;
-  for (; pos + 4 <= length; pos += 4) {
-    __m128i in = _mm_loadu_si128((__m128i *)(input + pos));
-    const __m128i ascii_bytes_bytemask =
-        _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000);
-    const __m128i one_two_bytes_bytemask =
-        _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000);
-    const __m128i two_bytes_bytemask =
-        _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask);
-    const __m128i one_two_three_bytes_bytemask =
-        _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
-    const __m128i three_bytes_bytemask =
-        _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask);
-    const uint16_t ascii_bytes_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(ascii_bytes_bytemask));
-    const uint16_t two_bytes_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(two_bytes_bytemask));
-    const uint16_t three_bytes_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(three_bytes_bytemask));
-
-    size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4;
-    size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4;
-    size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4;
-    count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count;
+  for (; pos + 8 <= length; pos += 8) {
+    __m256i in =
+        __lasx_xvld(reinterpret_cast<const uint32_t *>(input + pos), 0);
+    __m256i ascii_bytes_bytemask = __lasx_xvslt_w(in, v_80);
+    __m256i one_two_bytes_bytemask = __lasx_xvslt_w(in, v_800);
+    __m256i two_bytes_bytemask =
+        __lasx_xvxor_v(one_two_bytes_bytemask, ascii_bytes_bytemask);
+    __m256i three_bytes_bytemask =
+        __lasx_xvxor_v(__lasx_xvslt_w(in, v_10000), one_two_bytes_bytemask);
+
+    __m256i ascii_bytes =
+        __lasx_xvpcnt_w(__lasx_xvmskltz_w(ascii_bytes_bytemask));
+    const uint32_t ascii_bytes_count = __lasx_xvpickve2gr_wu(ascii_bytes, 0) +
+                                       __lasx_xvpickve2gr_wu(ascii_bytes, 4);
+    __m256i two_bytes = __lasx_xvpcnt_w(__lasx_xvmskltz_w(two_bytes_bytemask));
+    const uint32_t two_bytes_count = __lasx_xvpickve2gr_wu(two_bytes, 0) +
+                                     __lasx_xvpickve2gr_wu(two_bytes, 4);
+    __m256i three_bytes =
+        __lasx_xvpcnt_w(__lasx_xvmskltz_w(three_bytes_bytemask));
+    const uint32_t three_bytes_count = __lasx_xvpickve2gr_wu(three_bytes, 0) +
+                                       __lasx_xvpickve2gr_wu(three_bytes, 4);
+
+    count +=
+        32 - 3 * ascii_bytes_count - 2 * two_bytes_count - three_bytes_count;
   }
   return count +
          scalar::utf32::utf8_length_from_utf32(input + pos, length - pos);
@@ -41120,17 +56246,14 @@ simdutf_warn_unused size_t implementation::utf8_length_from_utf32(
 
 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(
     const char32_t *input, size_t length) const noexcept {
-  const __m128i v_00000000 = _mm_setzero_si128();
-  const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000);
+  __m128i v_ffff = __lsx_vldi(-2304); /*0x0000ffff*/
   size_t pos = 0;
   size_t count = 0;
   for (; pos + 4 <= length; pos += 4) {
-    __m128i in = _mm_loadu_si128((__m128i *)(input + pos));
-    const __m128i surrogate_bytemask =
-        _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000);
-    const uint16_t surrogate_bitmask =
-        static_cast<uint16_t>(_mm_movemask_epi8(surrogate_bytemask));
-    size_t surrogate_count = (16 - count_ones(surrogate_bitmask)) / 4;
+    __m128i in = __lsx_vld(reinterpret_cast<const uint32_t *>(input + pos), 0);
+    __m128i surrogate_bytemask = __lsx_vslt_wu(v_ffff, in);
+    size_t surrogate_count = __lsx_vpickve2gr_bu(
+        __lsx_vpcnt_b(__lsx_vmskltz_w(surrogate_bytemask)), 0);
     count += 4 + surrogate_count;
   }
   return count +
@@ -41206,18 +56329,12 @@ size_t implementation::binary_to_base64(const char *input, size_t length,
     return encode_base64<false>(output, input, length, options);
   }
 }
-} // namespace westmere
+} // namespace lasx
 } // namespace simdutf
 
-/* begin file src/simdutf/westmere/end.h */
-#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE
-// nothing needed.
-#else
-SIMDUTF_UNTARGET_REGION
-#endif
-
-/* end file src/simdutf/westmere/end.h */
-/* end file src/westmere/implementation.cpp */
+/* begin file src/simdutf/lasx/end.h */
+/* end file src/simdutf/lasx/end.h */
+/* end file src/lasx/implementation.cpp */
 #endif
 
 SIMDUTF_POP_DISABLE_WARNINGS
diff --git a/deps/simdutf/simdutf.h b/deps/simdutf/simdutf.h
index 5f82ca372ccfe3..2d984f40e7bc3f 100644
--- a/deps/simdutf/simdutf.h
+++ b/deps/simdutf/simdutf.h
@@ -1,4 +1,4 @@
-/* auto-generated on 2024-11-21 10:33:28 -0500. Do not edit! */
+/* auto-generated on 2024-12-10 14:54:53 -0500. Do not edit! */
 /* begin file include/simdutf.h */
 #ifndef SIMDUTF_H
 #define SIMDUTF_H
@@ -178,7 +178,12 @@
   #endif
 
 #elif defined(__loongarch_lp64)
-// LoongArch 64-bit
+  #if defined(__loongarch_sx) && defined(__loongarch_asx)
+    #define SIMDUTF_IS_LSX 1
+    #define SIMDUTF_IS_LASX 1
+  #elif defined(__loongarch_sx)
+    #define SIMDUTF_IS_LSX 1
+  #endif
 #else
   // The simdutf library is designed
   // for 64-bit processors and it seems that you are not
@@ -670,7 +675,7 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS
 #define SIMDUTF_SIMDUTF_VERSION_H
 
 /** The version of simdutf being used (major.minor.revision) */
-#define SIMDUTF_VERSION "5.6.3"
+#define SIMDUTF_VERSION "5.6.4"
 
 namespace simdutf {
 enum {
@@ -685,7 +690,7 @@ enum {
   /**
    * The revision (major.minor.REVISION) of simdutf being used.
    */
-  SIMDUTF_VERSION_REVISION = 3
+  SIMDUTF_VERSION_REVISION = 4
 };
 } // namespace simdutf
 
@@ -796,6 +801,8 @@ enum instruction_set {
   AVX512VPOPCNTDQ = 0x2000,
   RVV = 0x4000,
   ZVBB = 0x8000,
+  LSX = 0x40000,
+  LASX = 0x80000,
 };
 
 #if defined(__PPC64__)
@@ -987,6 +994,28 @@ static inline uint32_t detect_supported_architectures() {
   }
   return host_isa;
 }
+#elif defined(__loongarch__)
+  #if defined(__linux__)
+    #include <sys/auxv.h>
+  // bits/hwcap.h
+  // #define HWCAP_LOONGARCH_LSX             (1 << 4)
+  // #define HWCAP_LOONGARCH_LASX            (1 << 5)
+  #endif
+
+// Queries the kernel's AT_HWCAP word for the LoongArch LSX and LASX bits.
+static inline uint32_t detect_supported_architectures() {
+  uint32_t supported = instruction_set::DEFAULT;
+  #if defined(__linux__)
+  // Off Linux nothing is probed and only DEFAULT is reported.
+  const uint64_t hwcap = getauxval(AT_HWCAP);
+  const bool has_lsx = (hwcap & HWCAP_LOONGARCH_LSX) != 0;
+  const bool has_lasx = (hwcap & HWCAP_LOONGARCH_LASX) != 0;
+  // Each set capability bit is mirrored into the returned bitmask.
+  supported |= has_lsx ? uint32_t(instruction_set::LSX) : uint32_t(0);
+  supported |= has_lasx ? uint32_t(instruction_set::LASX) : uint32_t(0);
+  #endif
+  return supported;
+}
 #else // fallback
 
 // includes 32-bit ARM.