From 88d548f0e941df3b2f406c937f42447a2435008e Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 8 Aug 2022 14:15:20 +0300 Subject: [PATCH 1/2] gh-95781: More strict format string checking in PyUnicode_FromFormatV() An unrecognized format character in PyUnicode_FromFormat() and PyUnicode_FromFormatV() now sets a SystemError. In previous versions it caused all the rest of the format string to be copied as-is to the result string, and any extra arguments discarded. --- Doc/c-api/unicode.rst | 8 +++-- Doc/whatsnew/3.12.rst | 5 +++ Lib/test/test_unicode.py | 23 ++++++------- ...2-08-08-14-36-31.gh-issue-95781.W_G8YW.rst | 4 +++ Objects/unicodeobject.c | 33 ++++++------------- 5 files changed, 34 insertions(+), 39 deletions(-) create mode 100644 Misc/NEWS.d/next/C API/2022-08-08-14-36-31.gh-issue-95781.W_G8YW.rst diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 339ee35c7aa474..99afebd762a456 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -477,9 +477,6 @@ APIs: | | | :c:func:`PyObject_Repr`. | +-------------------+---------------------+----------------------------------+ - An unrecognized format character causes all the rest of the format string to be - copied as-is to the result string, and any extra arguments discarded. - .. note:: The width formatter unit is number of characters rather than bytes. The precision formatter unit is number of bytes for ``"%s"`` and @@ -500,6 +497,11 @@ APIs: Support width and precision formatter for ``"%s"``, ``"%A"``, ``"%U"``, ``"%V"``, ``"%S"``, ``"%R"`` added. + .. versionchanged:: 3.12 + An unrecognized format character now sets a :exc:`SystemError`. + In previous versions it caused all the rest of the format string to be + copied as-is to the result string, and any extra arguments discarded. + .. c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs) diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index ddf9e1f6a59b47..be8c8b453a52b6 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -461,6 +461,11 @@ Porting to Python 3.12 :py:meth:`~class.__subclasses__` (using :c:func:`PyObject_CallMethod`, for example). +* An unrecognized format character in :c:func:`PyUnicode_FromFormat` and + :c:func:`PyUnicode_FromFormatV` now sets a :exc:`SystemError`. + In previous versions it caused all the rest of the format string to be + copied as-is to the result string, and any extra arguments discarded. + Deprecated ---------- diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 9765ed97a60a44..63bccb72e04646 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -2641,8 +2641,6 @@ def check_format(expected, format, *args): b'%c%c', c_int(0x10000), c_int(0x100000)) # test "%" - check_format('%', - b'%') check_format('%', b'%%') check_format('%s', @@ -2819,23 +2817,22 @@ def check_format(expected, format, *args): check_format('repr=abc\ufffd', b'repr=%V', None, b'abc\xff') - # not supported: copy the raw format string. these tests are just here - # to check for crashes and should not be considered as specifications - check_format('%s', - b'%1%s', b'abc') - check_format('%1abc', - b'%1abc') - check_format('%+i', - b'%+i', c_int(10)) - check_format('%.%s', - b'%.%s', b'abc') - # Issue #33817: empty strings check_format('', b'') check_format('', b'%s', b'') + # check for crashes + for fmt in (b'%', b'%0', b'%01', b'%.', b'%.1', + b'%0%s', b'%1%s', b'%.%s', b'%.1%s', b'%1abc', + b'%l', b'%ll', b'%z', b'%ls', b'%lls', b'%zs'): + with self.subTest(fmt=fmt): + self.assertRaisesRegex(SystemError, 'invalid format string', + PyUnicode_FromFormat, fmt, b'abc') + self.assertRaisesRegex(SystemError, 'invalid format string', + PyUnicode_FromFormat, b'%+i', c_int(10)) + # Test PyUnicode_AsWideChar() @support.cpython_only @unittest.skipIf(_testcapi is None, 'need _testcapi module') diff --git a/Misc/NEWS.d/next/C API/2022-08-08-14-36-31.gh-issue-95781.W_G8YW.rst b/Misc/NEWS.d/next/C API/2022-08-08-14-36-31.gh-issue-95781.W_G8YW.rst new file mode 100644 index 00000000000000..eb2fd7e9da3d81 --- /dev/null +++ b/Misc/NEWS.d/next/C API/2022-08-08-14-36-31.gh-issue-95781.W_G8YW.rst @@ -0,0 +1,4 @@ +An unrecognized format character in :c:func:`PyUnicode_FromFormat` and +:c:func:`PyUnicode_FromFormatV` now sets a :exc:`SystemError`. +In previous versions it caused all the rest of the format string to be +copied as-is to the result string, and any extra arguments discarded. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7ff79953257ee6..184a2bfd5dd869 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2355,6 +2355,13 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, p = f; f++; + if (*f == '%') { + if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) + return NULL; + f++; + return f; + } + zeropad = 0; if (*f == '0') { zeropad = 1; @@ -2392,14 +2399,6 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, f++; } } - if (*f == '%') { - /* "%.3%s" => f points to "3" */ - f--; - } - } - if (*f == '\0') { - /* bogus format "%.123" => go backward, f points to "3" */ - f--; } /* Handle %ld, %lu, %lld and %llu. */ @@ -2423,7 +2422,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, ++f; } - if (f[1] == '\0') + if (f[0] != '\0' && f[1] == '\0') writer->overallocate = 0; switch (*f) { @@ -2616,21 +2615,9 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer, break; } - case '%': - if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0) - return NULL; - break; - default: - /* if we stumble upon an unknown formatting code, copy the rest - of the format string to the output string. (we cannot just - skip the code, since there's no way to know what's in the - argument list) */ - len = strlen(p); - if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1) - return NULL; - f = p+len; - return f; + PyErr_Format(PyExc_SystemError, "invalid format string: %s", p); + return NULL; } f++; From 9553449a48f44de8ec20c5929a745fab4a73bba8 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Mon, 8 Aug 2022 18:16:28 +0300 Subject: [PATCH 2/2] Add a reference. --- Doc/whatsnew/3.12.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index be8c8b453a52b6..02215524cfd671 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -465,6 +465,7 @@ Porting to Python 3.12 :c:func:`PyUnicode_FromFormatV` now sets a :exc:`SystemError`. In previous versions it caused all the rest of the format string to be copied as-is to the result string, and any extra arguments discarded. + (Contributed by Serhiy Storchaka in :gh:`95781`.) Deprecated