From b0ce85c3713d071f5c4999f7cac764f199e6585d Mon Sep 17 00:00:00 2001
From: Inada Naoki
Date: Tue, 29 Oct 2024 04:30:10 +0000
Subject: [PATCH] optimize unaligned memory load

Add load_unaligned() and use it in find_first_nonascii() to test the
unaligned head and the short tail of the input as a single size_t.
ascii_decode() moves below find_first_nonascii() and calls it on its
slow path.  HAS_CTZ is renamed to HAVE_CTZ.

load_unaligned() fills a union in native byte order, so it and its
callers are compiled only when PY_LITTLE_ENDIAN && HAVE_CTZ; all other
builds keep the per-byte loops.

---
 Objects/unicodeobject.c | 209 ++++++++++++++++++++--------------------
 1 file changed, 107 insertions(+), 102 deletions(-)

diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 0519a09cccc655c..ac388cca72e985f 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5036,70 +5036,15 @@ PyUnicode_DecodeUTF8(const char *s,
 # error C 'size_t' size should be either 4 or 8!
 #endif
 
-static Py_ssize_t
-ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
-{
-    const char *p = start;
-
-#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
-    if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)
-        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
-    {
-        /* Fast path, see in STRINGLIB(utf8_decode) for
-           an explanation. */
-        /* Help allocation */
-        const char *_p = p;
-        Py_UCS1 * q = dest;
-        while (_p + SIZEOF_SIZE_T <= end) {
-            size_t value = *(const size_t *) _p;
-            if (value & ASCII_CHAR_MASK)
-                break;
-            *((size_t *)q) = value;
-            _p += SIZEOF_SIZE_T;
-            q += SIZEOF_SIZE_T;
-        }
-        p = _p;
-        while (p < end) {
-            if ((unsigned char)*p & 0x80)
-                break;
-            *q++ = *p++;
-        }
-        return p - start;
-    }
-#endif
-    while (p < end) {
-        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
-           for an explanation. */
-        if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
-            /* Help allocation */
-            const char *_p = p;
-            while (_p + SIZEOF_SIZE_T <= end) {
-                size_t value = *(const size_t *) _p;
-                if (value & ASCII_CHAR_MASK)
-                    break;
-                _p += SIZEOF_SIZE_T;
-            }
-            p = _p;
-            if (_p == end)
-                break;
-        }
-        if ((unsigned char)*p & 0x80)
-            break;
-        ++p;
-    }
-    memcpy(dest, start, p - start);
-    return p - start;
-}
-
 #if (defined(__clang__) || defined(__GNUC__))
-#define HAS_CTZ 1
+#define HAVE_CTZ 1
 static inline unsigned int
 ctz(size_t v)
 {
     return __builtin_ctzll((unsigned long long)v);
 }
 #elif defined(_MSC_VER)
-#define HAS_CTZ 1
+#define HAVE_CTZ 1
 static inline unsigned int
 ctz(size_t v)
 {
@@ -5113,24 +5058,83 @@ ctz(size_t v)
 }
 #endif
 
+#if PY_LITTLE_ENDIAN && HAVE_CTZ
+// Load p[0]..p[size-1] into the low bytes of a zero-filled size_t,
+// without unaligned access or reading past p[size-1].
+// The union stores the bytes in native order, so the ctz()-based callers
+// get a correct byte index only on little-endian builds.
+static size_t
+load_unaligned(const unsigned char *p, size_t size)
+{
+    assert(size <= SIZEOF_SIZE_T);
+    union {
+        size_t s;
+        unsigned char b[SIZEOF_SIZE_T];
+    } u;
+    u.s = 0;
+    switch (size) {
+#if SIZEOF_SIZE_T == 8
+    case 8:
+        u.b[7] = p[7];
+        // fall through
+    case 7:
+        u.b[6] = p[6];
+        // fall through
+    case 6:
+        u.b[5] = p[5];
+        // fall through
+    case 5:
+        u.b[4] = p[4];
+        // fall through
+#endif
+    case 4:
+        u.b[3] = p[3];
+        // fall through
+    case 3:
+        u.b[2] = p[2];
+        // fall through
+    case 2:
+        u.b[1] = p[1];
+        // fall through
+    case 1:
+        u.b[0] = p[0];
+        break;
+    case 0:
+        break;
+    default:
+        Py_UNREACHABLE();
+    }
+    return u.s;
+}
+#endif
+
 static Py_ssize_t
 find_first_nonascii(const unsigned char *start, const unsigned char *end)
 {
     const unsigned char *p = start;
 
-    if (end - start > SIZEOF_SIZE_T + ALIGNOF_SIZE_T) {
-        while (!_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
-            if ((unsigned char)*p & 0x80) {
+    if (end - start >= SIZEOF_SIZE_T) {
+        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
+#if PY_LITTLE_ENDIAN && HAVE_CTZ
+        size_t u = load_unaligned(p, p2 - p) & ASCII_CHAR_MASK;
+        if (u) {
+            return p - start + (ctz(u) - 7) / 8;
+        }
+        p = p2;
+#else
+        while (p < p2) {
+            if (*p & 0x80) {
                 return p - start;
             }
             p++;
         }
+#endif
         const unsigned char *e = end - SIZEOF_SIZE_T;
         while (p <= e) {
-            size_t value = (*(const size_t *)p) & ASCII_CHAR_MASK;
-            if (value) {
-#if PY_LITTLE_ENDIAN && HAS_CTZ
-                return p - start + (ctz(value) - 7) / 8;
+            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
+            if (u) {
+#if PY_LITTLE_ENDIAN && HAVE_CTZ
+                return p - start + (ctz(u) - 7) / 8;
 #else
                 // big endian and minor compilers are difficult to test.
                 // fallback to per byte check.
@@ -5140,47 +5144,15 @@ find_first_nonascii(const unsigned char *start, const unsigned char *end)
             p += SIZEOF_SIZE_T;
         }
     }
-#if HAS_CTZ
-    // This part looks bit tricky, but decoding short ASCII is super important.
-    // Since we copy from p to size_t manually, this part works fine with big endian.
-    while (p < end) {
-        size_t u = (size_t)(p[0]);
-        switch (end - p) {
-        default:
-#if SIZEOF_SIZE_T == 8
-            u |= (size_t)(p[7]) << 56ull;
-            // fall through
-        case 7:
-            u |= (size_t)(p[6]) << 48ull;
-            // fall through
-        case 6:
-            u |= (size_t)(p[5]) << 40ull;
-            // fall through
-        case 5:
-            u |= (size_t)(p[4]) << 32ull;
-            // fall through
-        case 4:
-#endif
-            u |= (size_t)(p[3]) << 24;
-            // fall through
-        case 3:
-            u |= (size_t)(p[2]) << 16;
-            // fall through
-        case 2:
-            u |= (size_t)(p[1]) << 8;
-            break;
-        case 1:
-            break;
-        }
-        if (u & ASCII_CHAR_MASK) {
-            return p - start + (ctz(u & ASCII_CHAR_MASK) - 7) / 8;
-        }
-        p += SIZEOF_SIZE_T;
+#if PY_LITTLE_ENDIAN && HAVE_CTZ
+    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
+    if (u) {
+        return p - start + (ctz(u) - 7) / 8;
     }
     return end - start;
 #else
     while (p < end) {
-        if ((unsigned char)*p & 0x80) {
+        if (*p & 0x80) {
             break;
         }
         p++;
@@ -5204,7 +5176,7 @@ static Py_ssize_t utf8_count_codepoints(const unsigned char *s, const unsigned c
 {
     Py_ssize_t len = 0;
 
-    if (end - s > SIZEOF_SIZE_T + ALIGNOF_SIZE_T) {
+    if (end - s >= SIZEOF_SIZE_T) {
         while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
             len += scalar_utf8_start_char(*s++);
         }
@@ -5235,6 +5207,39 @@ static Py_ssize_t utf8_count_codepoints(const unsigned char *s, const unsigned c
     return len;
 }
 
+static Py_ssize_t
+ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
+{
+#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
+    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
+        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
+    {
+        /* Fast path, see in STRINGLIB(utf8_decode) for
+           an explanation. */
+        const char *p = start;
+        Py_UCS1 *q = dest;
+        while (p + SIZEOF_SIZE_T <= end) {
+            size_t value = *(const size_t *) p;
+            if (value & ASCII_CHAR_MASK)
+                break;
+            *((size_t *)q) = value;
+            p += SIZEOF_SIZE_T;
+            q += SIZEOF_SIZE_T;
+        }
+        while (p < end) {
+            if ((unsigned char)*p & 0x80)
+                break;
+            *q++ = *p++;
+        }
+        return p - start;
+    }
+#endif
+    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
+                                         (const unsigned char*)end);
+    memcpy(dest, start, pos);
+    return pos;
+}
+
 static int
 unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
                          const char *starts, const char *s, const char *end,