Skip to content

Commit

Permalink
[3.11] gh-113594: Fix UnicodeEncodeError in TokenList.fold() (GH-113730
Browse files Browse the repository at this point in the history
…) (GH-113908)

It occurred when trying to re-encode an unknown-8bit part combined with a non-unknown-8bit part.
(cherry picked from commit e9d5b6e)

Co-authored-by: Serhiy Storchaka <[email protected]>
  • Loading branch information
miss-islington and serhiy-storchaka authored Jan 10, 2024
1 parent c92a473 commit 435e891
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 0 deletions.
7 changes: 7 additions & 0 deletions Lib/email/_header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2768,6 +2768,7 @@ def _refold_parse_tree(parse_tree, *, policy):
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
lines = ['']
last_ew = None
last_charset = None
wrap_as_ew_blocked = 0
want_encoding = False
end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
Expand Down Expand Up @@ -2822,8 +2823,14 @@ def _refold_parse_tree(parse_tree, *, policy):
else:
# It's a terminal, wrap it as an encoded word, possibly
# combining it with previously encoded words if allowed.
if (last_ew is not None and
charset != last_charset and
(last_charset == 'unknown-8bit' or
last_charset == 'utf-8' and charset != 'us-ascii')):
last_ew = None
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
part.ew_combine_allowed, charset)
last_charset = charset
want_encoding = False
continue
if len(tstr) <= maxlen - len(lines[-1]):
Expand Down
39 changes: 39 additions & 0 deletions Lib/test/test_email/test__header_value_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2915,6 +2915,45 @@ def test_ews_combined_before_wrap(self):
"mich. And that's\n"
" all I'm sayin.\n")

def test_unicode_after_unknown_not_combined(self):
    """Non-ASCII text after an unknown-8bit encoded word stays separate.

    The expected output keeps the unknown-8bit word as-is and puts the
    trailing character into its own utf-8 encoded word.
    NOTE(review): ``self._test`` is defined outside this view; presumably
    it folds the parsed value and compares it with the expected string.
    """
    source = "=?unknown-8bit?q?=A4?=\xa4"

    # Short input: both encoded words are expected on a single line.
    self._test(
        parser.get_unstructured(source),
        "=?unknown-8bit?q?=A4?==?utf-8?q?=C2=A4?=\n",
    )

    # With a 55-character prefix the second encoded word is expected on
    # a continuation line.
    prefix = "0123456789 "*5
    self._test(
        parser.get_unstructured(prefix + source),
        prefix + "=?unknown-8bit?q?=A4?=\n =?utf-8?q?=C2=A4?=\n",
    )

def test_ascii_after_unknown_not_combined(self):
    """ASCII text after an unknown-8bit encoded word is not merged into it.

    NOTE(review): ``self._test`` is defined outside this view; presumably
    it folds the parsed value and compares it with the expected string.
    """
    source = "=?unknown-8bit?q?=A4?=abc"

    # Short input: the ASCII tail is expected to stay as plain text.
    self._test(
        parser.get_unstructured(source),
        "=?unknown-8bit?q?=A4?=abc\n",
    )

    # With a 55-character prefix the ASCII tail is expected on a
    # continuation line as its own utf-8 encoded word.
    prefix = "0123456789 "*5
    self._test(
        parser.get_unstructured(prefix + source),
        prefix + "=?unknown-8bit?q?=A4?=\n =?utf-8?q?abc?=\n",
    )

def test_unknown_after_unicode_not_combined(self):
    """An unknown-8bit encoded word after non-ASCII text stays separate.

    The leading character is expected as a utf-8 encoded word and the
    unknown-8bit word is expected to be emitted unchanged after it.
    NOTE(review): ``self._test`` is defined outside this view; presumably
    it folds the parsed value and compares it with the expected string.
    """
    source = ("\xa4"
              "=?unknown-8bit?q?=A4?=")

    # Short input: both encoded words are expected on a single line.
    self._test(
        parser.get_unstructured(source),
        "=?utf-8?q?=C2=A4?==?unknown-8bit?q?=A4?=\n",
    )

    # With a 55-character prefix the unknown-8bit word is expected on a
    # continuation line.
    prefix = "0123456789 "*5
    self._test(
        parser.get_unstructured(prefix + source),
        prefix + "=?utf-8?q?=C2=A4?=\n =?unknown-8bit?q?=A4?=\n",
    )

def test_unknown_after_ascii_not_combined(self):
    """An unknown-8bit encoded word after ASCII text is left untouched.

    NOTE(review): ``self._test`` is defined outside this view; presumably
    it folds the parsed value and compares it with the expected string.
    """
    # Short input: the output is expected to equal the input plus newline.
    short_source = ("abc"
                    "=?unknown-8bit?q?=A4?=")
    self._test(
        parser.get_unstructured(short_source),
        "abc=?unknown-8bit?q?=A4?=\n",
    )

    # With a 55-character prefix (and a four-letter ASCII run) the
    # encoded word is expected on a continuation line while the ASCII
    # text stays plain.
    prefix = "0123456789 "*5
    long_source = prefix + "abcd=?unknown-8bit?q?=A4?="
    self._test(
        parser.get_unstructured(long_source),
        prefix + "abcd\n =?unknown-8bit?q?=A4?=\n",
    )

def test_unknown_after_unknown(self):
    """Two adjacent unknown-8bit encoded words may be combined.

    NOTE(review): ``self._test`` is defined outside this view; presumably
    it folds the parsed value and compares it with the expected string.
    """
    source = ("=?unknown-8bit?q?=C2?="
              "=?unknown-8bit?q?=A4?=")

    # Short input: the two payloads are expected to be merged into a
    # single unknown-8bit encoded word.
    self._test(
        parser.get_unstructured(source),
        "=?unknown-8bit?q?=C2=A4?=\n",
    )

    # With a 55-character prefix the words are expected to stay separate,
    # the second one on a continuation line.
    prefix = "0123456789 "*5
    self._test(
        parser.get_unstructured(prefix + source),
        prefix + "=?unknown-8bit?q?=C2?=\n =?unknown-8bit?q?=A4?=\n",
    )

# XXX Need test of an encoded word so long that it needs to be wrapped

def test_simple_address(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix :exc:`UnicodeEncodeError` in :mod:`email` when re-folding lines that
contain an unknown-8bit encoded part followed by a non-unknown-8bit encoded part.

0 comments on commit 435e891

Please sign in to comment.