From 6c8aedcb71f7da0f9ba80f9d819f4b45e8eb5919 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 22 May 2024 12:46:37 +0200 Subject: [PATCH] gh-119182: Optimize PyUnicode_FromFormat() UTF-8 decoder Add unicode_decode_utf8_writer() to write directly characters into a _PyUnicodeWriter writer: avoid the creation of a temporary string. Optimize PyUnicode_FromFormat() by using the new unicode_decode_utf8_writer(). Rename unicode_fromformat_write_cstr() to unicode_fromformat_write_utf8(). Microbenchmark on the code: return PyUnicode_FromFormat( "%s %s %s %s %s.", "format", "multiple", "utf8", "short", "strings"); Result: 620 ns +- 8 ns -> 382 ns +- 2 ns: 1.62x faster. --- Objects/unicodeobject.c | 154 +++++++++++++++++++++++++--------------- 1 file changed, 96 insertions(+), 58 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 057b417074ebeaf..842b64ab57e793c 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -201,6 +201,11 @@ static PyObject * unicode_decode_utf8(const char *s, Py_ssize_t size, _Py_error_handler error_handler, const char *errors, Py_ssize_t *consumed); +static int +unicode_decode_utf8_writer(_PyUnicodeWriter *writer, + const char *s, Py_ssize_t size, + _Py_error_handler error_handler, const char *errors, + Py_ssize_t *consumed); #ifdef Py_DEBUG static inline int unicode_is_finalizing(void); static int unicode_is_singleton(PyObject *unicode); @@ -2376,14 +2381,11 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, } static int -unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, +unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str, Py_ssize_t width, Py_ssize_t precision, int flags) { /* UTF-8 */ Py_ssize_t length; - PyObject *unicode; - int res; - if (precision == -1) { length = strlen(str); } @@ -2393,13 +2395,22 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str, length++; } } - unicode = 
PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL); - if (unicode == NULL) - return -1; - res = unicode_fromformat_write_str(writer, unicode, width, -1, flags); - Py_DECREF(unicode); - return res; + if (width < 0) { + return unicode_decode_utf8_writer(writer, str, length, + _Py_ERROR_UNKNOWN, "replace", NULL); + } + else { + PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length, + "replace", NULL); + if (unicode == NULL) + return -1; + + int res = unicode_fromformat_write_str(writer, unicode, + width, -1, flags); + Py_DECREF(unicode); + return res; + } } static int @@ -2699,7 +2710,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, else { /* UTF-8 */ const char *s = va_arg(*vargs, const char*); - if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0) + if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0) return NULL; } break; @@ -2738,7 +2749,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, } else { assert(str != NULL); - if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0) + if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0) return NULL; } break; @@ -4736,46 +4747,37 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest) return p - start; } -static PyObject * -unicode_decode_utf8(const char *s, Py_ssize_t size, - _Py_error_handler error_handler, const char *errors, - Py_ssize_t *consumed) -{ - if (size == 0) { - if (consumed) - *consumed = 0; - _Py_RETURN_UNICODE_EMPTY(); - } - - /* ASCII is equivalent to the first 128 ordinals in Unicode. */ - if (size == 1 && (unsigned char)s[0] < 128) { - if (consumed) { - *consumed = 1; - } - return get_latin1_char((unsigned char)s[0]); - } +static int +unicode_decode_utf8_writer(_PyUnicodeWriter *writer, + const char *s, Py_ssize_t size, + _Py_error_handler error_handler, const char *errors, + Py_ssize_t *consumed) +{ const char *starts = s; const char *end = s + size; // fast path: try ASCII string. 
- PyObject *u = PyUnicode_New(size, 127); - if (u == NULL) { - return NULL; + if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) { + return -1; } - s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u)); - if (s == end) { - if (consumed) { - *consumed = size; + + Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind; + if (writer->kind == PyUnicode_1BYTE_KIND + && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T)) + { + Py_ssize_t decoded = ascii_decode(s, end, dest); + writer->pos += decoded; + + if (decoded == size) { + if (consumed) { + *consumed = size; + } + return 0; } - return u; + s += decoded; } - // Use _PyUnicodeWriter after fast path is failed. - _PyUnicodeWriter writer; - _PyUnicodeWriter_InitWithBuffer(&writer, u); - writer.pos = s - starts; - Py_ssize_t startinpos, endinpos; const char *errmsg = ""; PyObject *error_handler_obj = NULL; @@ -4783,18 +4785,18 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, while (s < end) { Py_UCS4 ch; - int kind = writer.kind; + int kind = writer->kind; if (kind == PyUnicode_1BYTE_KIND) { - if (PyUnicode_IS_ASCII(writer.buffer)) - ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos); + if (PyUnicode_IS_ASCII(writer->buffer)) + ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos); else - ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos); + ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos); } else if (kind == PyUnicode_2BYTE_KIND) { - ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos); + ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos); } else { assert(kind == PyUnicode_4BYTE_KIND); - ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos); + ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos); } switch (ch) { @@ -4825,7 +4827,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, endinpos = startinpos + ch - 1; break; default: - if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) + if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) 
goto onError; continue; } @@ -4839,7 +4841,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, break; case _Py_ERROR_REPLACE: - if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0) + if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0) goto onError; s += (endinpos - startinpos); break; @@ -4848,13 +4850,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, { Py_ssize_t i; - if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0) + if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0) goto onError; for (i=startinpos; i<endinpos; i++) { ch = (Py_UCS4)(unsigned char)(starts[i]); - PyUnicode_WRITE(writer.kind, writer.data, writer.pos, + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch + 0xdc00); - writer.pos++; + writer->pos++; } s += (endinpos - startinpos); break; @@ -4865,8 +4867,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, errors, &error_handler_obj, "utf-8", errmsg, &starts, &end, &startinpos, &endinpos, &exc, &s, - &writer)) + writer)) { goto onError; + } + + if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) { + return -1; + } } } @@ -4876,13 +4883,44 @@ unicode_decode_utf8(const char *s, Py_ssize_t size, Py_XDECREF(error_handler_obj); Py_XDECREF(exc); - return _PyUnicodeWriter_Finish(&writer); + return 0; onError: Py_XDECREF(error_handler_obj); Py_XDECREF(exc); - _PyUnicodeWriter_Dealloc(&writer); - return NULL; + return -1; +} + + +static PyObject * +unicode_decode_utf8(const char *s, Py_ssize_t size, + _Py_error_handler error_handler, const char *errors, + Py_ssize_t *consumed) +{ + if (size == 0) { + if (consumed) + *consumed = 0; + _Py_RETURN_UNICODE_EMPTY(); + } + + /* ASCII is equivalent to the first 128 ordinals in Unicode. */ + if (size == 1 && (unsigned char)s[0] < 128) { + if (consumed) { + *consumed = 1; + } + return get_latin1_char((unsigned char)s[0]); + } + + _PyUnicodeWriter writer; + _PyUnicodeWriter_Init(&writer); + + if (unicode_decode_utf8_writer(&writer, s, size, + error_handler, errors, + consumed) < 0) { + _PyUnicodeWriter_Dealloc(&writer); + return NULL; + } + return _PyUnicodeWriter_Finish(&writer); }