-
-
Notifications
You must be signed in to change notification settings - Fork 30.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
gh-99612: Fix PyUnicode_DecodeUTF8Stateful() for ASCII-only data #99613
Changes from 2 commits
8ff13de
b99be83
3d23080
1c07d9f
fd8c21d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import unittest | ||
from test import support | ||
|
||
try: | ||
import _testcapi | ||
except ImportError: | ||
_testcapi = None | ||
|
||
|
||
class CAPITest(unittest.TestCase): | ||
|
||
@support.cpython_only | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To me, this is redundant with the check that _testcapi is available: for example, PyPy doesn't have _testcapi. But you can keep it if you prefer to be explicit. |
||
@unittest.skipIf(_testcapi is None, 'need _testcapi module') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. IMO you can apply it to the whole class instead. |
||
def test_decodeutf8(self): | ||
"""Test PyUnicode_DecodeUTF8()""" | ||
from _testcapi import unicode_decodeutf8 as decodeutf8 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You may use |
||
|
||
for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600']: | ||
b = s.encode('utf-8') | ||
self.assertEqual(decodeutf8(b), s) | ||
self.assertEqual(decodeutf8(b, 'strict'), s) | ||
|
||
self.assertRaises(UnicodeDecodeError, decodeutf8, b'\x80') | ||
self.assertRaises(UnicodeDecodeError, decodeutf8, b'\xc0') | ||
self.assertRaises(UnicodeDecodeError, decodeutf8, b'\xff') | ||
self.assertRaises(UnicodeDecodeError, decodeutf8, b'a\xf0\x9f') | ||
self.assertEqual(decodeutf8(b'a\xf0\x9f', 'replace'), 'a\ufffd') | ||
self.assertEqual(decodeutf8(b'a\xf0\x9fb', 'replace'), 'a\ufffdb') | ||
|
||
self.assertRaises(LookupError, decodeutf8, b'a\x80', 'foo') | ||
# TODO: Test PyUnicode_DecodeUTF8() with NULL as data and | ||
# negative size. | ||
|
||
@support.cpython_only | ||
@unittest.skipIf(_testcapi is None, 'need _testcapi module') | ||
def test_decodeutf8stateful(self): | ||
"""Test PyUnicode_DecodeUTF8Stateful()""" | ||
from _testcapi import unicode_decodeutf8stateful as decodeutf8stateful | ||
|
||
for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600']: | ||
b = s.encode('utf-8') | ||
self.assertEqual(decodeutf8stateful(b), (s, len(b))) | ||
self.assertEqual(decodeutf8stateful(b, 'strict'), (s, len(b))) | ||
|
||
self.assertRaises(UnicodeDecodeError, decodeutf8stateful, b'\x80') | ||
self.assertRaises(UnicodeDecodeError, decodeutf8stateful, b'\xc0') | ||
self.assertRaises(UnicodeDecodeError, decodeutf8stateful, b'\xff') | ||
self.assertEqual(decodeutf8stateful(b'a\xf0\x9f'), ('a', 1)) | ||
self.assertEqual(decodeutf8stateful(b'a\xf0\x9f', 'replace'), ('a', 1)) | ||
self.assertRaises(UnicodeDecodeError, decodeutf8stateful, b'a\xf0\x9fb') | ||
self.assertEqual(decodeutf8stateful(b'a\xf0\x9fb', 'replace'), ('a\ufffdb', 4)) | ||
|
||
self.assertRaises(LookupError, decodeutf8stateful, b'a\x80', 'foo') | ||
# TODO: Test PyUnicode_DecodeUTF8Stateful() with NULL as data and | ||
# negative size. | ||
# TODO: Test PyUnicode_DecodeUTF8Stateful() with NULL as the address of | ||
# "consumed". | ||
|
||
|
||
if __name__ == "__main__": | ||
unittest.main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
Fix :c:func:`PyUnicode_DecodeUTF8Stateful` for ASCII-only data: | ||
``*consumed`` was not set. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
#define PY_SSIZE_T_CLEAN | ||
#include "parts.h" | ||
|
||
static struct PyModuleDef *_testcapimodule = NULL; // set at initialization | ||
|
@@ -223,6 +224,40 @@ unicode_asutf8andsize(PyObject *self, PyObject *args) | |
return Py_BuildValue("(Nn)", result, utf8_len); | ||
} | ||
|
||
/* Test PyUnicode_DecodeUTF8() */ | ||
static PyObject * | ||
unicode_decodeutf8(PyObject *self, PyObject *args) | ||
{ | ||
const char *data; | ||
Py_ssize_t size; | ||
const char *errors = NULL; | ||
|
||
if (!PyArg_ParseTuple(args, "y#|z", &data, &size, &errors)) | ||
return NULL; | ||
|
||
return PyUnicode_DecodeUTF8(data, size, errors); | ||
} | ||
|
||
/* Test PyUnicode_DecodeUTF8Stateful() */ | ||
static PyObject * | ||
unicode_decodeutf8stateful(PyObject *self, PyObject *args) | ||
{ | ||
const char *data; | ||
Py_ssize_t size; | ||
const char *errors = NULL; | ||
Py_ssize_t consumed; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would feel safer if you initialized the value, maybe to a marker value like 42. Otherwise, the test may miss the bug by luck if local variables allocated on the stack happen to be initialized to 0. |
||
PyObject *result; | ||
|
||
if (!PyArg_ParseTuple(args, "y#|z", &data, &size, &errors)) | ||
return NULL; | ||
|
||
result = PyUnicode_DecodeUTF8Stateful(data, size, errors, &consumed); | ||
if (!result) { | ||
return NULL; | ||
} | ||
return Py_BuildValue("(Nn)", result, consumed); | ||
} | ||
|
||
static PyObject * | ||
unicode_count(PyObject *self, PyObject *args) | ||
{ | ||
|
@@ -716,6 +751,8 @@ static PyMethodDef TestMethods[] = { | |
{"unicode_asucs4", unicode_asucs4, METH_VARARGS}, | ||
{"unicode_asutf8", unicode_asutf8, METH_VARARGS}, | ||
{"unicode_asutf8andsize", unicode_asutf8andsize, METH_VARARGS}, | ||
{"unicode_decodeutf8", unicode_decodeutf8, METH_VARARGS}, | ||
{"unicode_decodeutf8stateful",unicode_decodeutf8stateful, METH_VARARGS}, | ||
{"unicode_count", unicode_count, METH_VARARGS}, | ||
{"unicode_findchar", unicode_findchar, METH_VARARGS}, | ||
{"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not skip this test if the module is missing?