From 435e891b32318b4a0fed2843d0cb37ee21c07e4b Mon Sep 17 00:00:00 2001 From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com> Date: Wed, 10 Jan 2024 14:24:17 +0100 Subject: [PATCH] [3.11] gh-113594: Fix UnicodeEncodeError in TokenList.fold() (GH-113730) (GH-113908) It occurred when trying to re-encode an unknown-8bit part combined with a non-unknown-8bit part. (cherry picked from commit e9d5b6ea2d68564f176fdf70c2d7028e060c62b5) Co-authored-by: Serhiy Storchaka --- Lib/email/_header_value_parser.py | 7 ++++ .../test_email/test__header_value_parser.py | 39 +++++++++++++++++++ ...-01-05-12-42-07.gh-issue-113594.4t8HiR.rst | 2 + 3 files changed, 48 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2024-01-05-12-42-07.gh-issue-113594.4t8HiR.rst diff --git a/Lib/email/_header_value_parser.py b/Lib/email/_header_value_parser.py index e637e6df06612d..f4334f1fe69cbe 100644 --- a/Lib/email/_header_value_parser.py +++ b/Lib/email/_header_value_parser.py @@ -2768,6 +2768,7 @@ def _refold_parse_tree(parse_tree, *, policy): encoding = 'utf-8' if policy.utf8 else 'us-ascii' lines = [''] last_ew = None + last_charset = None wrap_as_ew_blocked = 0 want_encoding = False end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked') @@ -2822,8 +2823,14 @@ def _refold_parse_tree(parse_tree, *, policy): else: # It's a terminal, wrap it as an encoded word, possibly # combining it with previously encoded words if allowed. 
+ if (last_ew is not None and + charset != last_charset and + (last_charset == 'unknown-8bit' or + last_charset == 'utf-8' and charset != 'us-ascii')): + last_ew = None last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew, part.ew_combine_allowed, charset) + last_charset = charset want_encoding = False continue if len(tstr) <= maxlen - len(lines[-1]): diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 854f2ff009c618..bdb0e55f21069f 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2915,6 +2915,45 @@ def test_ews_combined_before_wrap(self): "mich. And that's\n" " all I'm sayin.\n") + def test_unicode_after_unknown_not_combined(self): + self._test(parser.get_unstructured("=?unknown-8bit?q?=A4?=\xa4"), + "=?unknown-8bit?q?=A4?==?utf-8?q?=C2=A4?=\n") + prefix = "0123456789 "*5 + self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=A4?=\xa4"), + prefix + "=?unknown-8bit?q?=A4?=\n =?utf-8?q?=C2=A4?=\n") + + def test_ascii_after_unknown_not_combined(self): + self._test(parser.get_unstructured("=?unknown-8bit?q?=A4?=abc"), + "=?unknown-8bit?q?=A4?=abc\n") + prefix = "0123456789 "*5 + self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=A4?=abc"), + prefix + "=?unknown-8bit?q?=A4?=\n =?utf-8?q?abc?=\n") + + def test_unknown_after_unicode_not_combined(self): + self._test(parser.get_unstructured("\xa4" + "=?unknown-8bit?q?=A4?="), + "=?utf-8?q?=C2=A4?==?unknown-8bit?q?=A4?=\n") + prefix = "0123456789 "*5 + self._test(parser.get_unstructured(prefix + "\xa4=?unknown-8bit?q?=A4?="), + prefix + "=?utf-8?q?=C2=A4?=\n =?unknown-8bit?q?=A4?=\n") + + def test_unknown_after_ascii_not_combined(self): + self._test(parser.get_unstructured("abc" + "=?unknown-8bit?q?=A4?="), + "abc=?unknown-8bit?q?=A4?=\n") + prefix = "0123456789 "*5 + self._test(parser.get_unstructured(prefix + "abcd=?unknown-8bit?q?=A4?="), + prefix + "abcd\n 
=?unknown-8bit?q?=A4?=\n") + + def test_unknown_after_unknown(self): + self._test(parser.get_unstructured("=?unknown-8bit?q?=C2?=" + "=?unknown-8bit?q?=A4?="), + "=?unknown-8bit?q?=C2=A4?=\n") + prefix = "0123456789 "*5 + self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=C2?=" + "=?unknown-8bit?q?=A4?="), + prefix + "=?unknown-8bit?q?=C2?=\n =?unknown-8bit?q?=A4?=\n") + # XXX Need test of an encoded word so long that it needs to be wrapped def test_simple_address(self): diff --git a/Misc/NEWS.d/next/Library/2024-01-05-12-42-07.gh-issue-113594.4t8HiR.rst b/Misc/NEWS.d/next/Library/2024-01-05-12-42-07.gh-issue-113594.4t8HiR.rst new file mode 100644 index 00000000000000..c71bc9c20e4596 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-01-05-12-42-07.gh-issue-113594.4t8HiR.rst @@ -0,0 +1,2 @@ +Fix :exc:`UnicodeEncodeError` in :mod:`email` when re-folding lines that +contain an unknown-8bit encoded part followed by a non-unknown-8bit encoded part.