From 474f302ae8f0bb017e12fd00920ab20ed544f8a6 Mon Sep 17 00:00:00 2001 From: Adrian Clay Date: Wed, 6 Sep 2023 16:12:56 +0100 Subject: [PATCH] NIAD-2822: Fix issue where incoming messages containing multibyte UTF-8 characters are mangled (#137) * Create failing unit tests to reproduce NIAD-2822 * Use message_from_bytes method --- .../mhs_common/messages/ebxml_request_envelope.py | 2 +- .../messages/tests/test_ebxml_request_envelope.py | 6 ++++++ .../ebxml_request_multibyte_character.ebxml | 1 + .../ebxml_request_multibyte_character.msg | 12 ++++++++++++ 4 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 mhs/common/mhs_common/messages/tests/test_messages/ebxml_request_multibyte_character.ebxml create mode 100644 mhs/common/mhs_common/messages/tests/test_messages/ebxml_request_multibyte_character.msg diff --git a/mhs/common/mhs_common/messages/ebxml_request_envelope.py b/mhs/common/mhs_common/messages/ebxml_request_envelope.py index 0971c086f..8a16b6f3b 100644 --- a/mhs/common/mhs_common/messages/ebxml_request_envelope.py +++ b/mhs/common/mhs_common/messages/ebxml_request_envelope.py @@ -191,7 +191,7 @@ def _parse_mime_message(headers: Dict[str, str], message: str) -> email.message. """ content_type_header = f'{HttpHeaders.CONTENT_TYPE}: {headers[HttpHeaders.CONTENT_TYPE]}\r\n\r\n' - msg = email.message_from_string(content_type_header + message, policy=email.policy.HTTP) + msg = email.message_from_bytes(bytes(content_type_header + message, 'utf-8'), policy=email.policy.HTTP) if msg.defects: logger.warning('Found defects in MIME message during parsing. {Defects}', diff --git a/mhs/common/mhs_common/messages/tests/test_ebxml_request_envelope.py b/mhs/common/mhs_common/messages/tests/test_ebxml_request_envelope.py index ee5f4447e..afcd87ba6 100644 --- a/mhs/common/mhs_common/messages/tests/test_ebxml_request_envelope.py +++ b/mhs/common/mhs_common/messages/tests/test_ebxml_request_envelope.py @@ -331,6 +331,12 @@ def test_from_string_parses_valid_requests(self): self.assertEqual(expected_values_with_payload, parsed_message.message_dictionary) + with self.subTest("A valid request containing multibyte UTF8 characters within HL7 XML"): + # Regression test for NIAD-2822 + message, _ = message_utilities.load_test_data(self.message_dir, 'ebxml_request_multibyte_character') + parsed_message = ebxml_request_envelope.EbxmlRequestEnvelope.from_string(MULTIPART_MIME_HEADERS, message) + self.assertEquals(parsed_message.message_dictionary['hl7_message'], "¬ ❤️ 🧸") + with self.subTest("A valid request containing one textual attachment with no description provided"): message, ebxml = message_utilities.load_test_data(self.message_dir, 'ebxml_request_one_attachment_application_xml_content_type_no_description') attachments = [{ diff --git a/mhs/common/mhs_common/messages/tests/test_messages/ebxml_request_multibyte_character.ebxml b/mhs/common/mhs_common/messages/tests/test_messages/ebxml_request_multibyte_character.ebxml new file mode 100644 index 000000000..ed54a42e0 --- /dev/null +++ b/mhs/common/mhs_common/messages/tests/test_messages/ebxml_request_multibyte_character.ebxml @@ -0,0 +1 @@ +5EP-807264YGMYW-8229935f0ba4c0967d2a9ca9b3018A2D28-A043-73C9-BAE9-5CB491815950urn:nhs:names:services:gp2gpRCMR_IN030000UK06483E788A-3464-46E3-ACE3-E15DB68D8D362023-08-25T14:45:57.571Z2023-08-25T21:00:57.571ZRCMR_IN030000UK068231C804-9E4B-451E-8323-5FC86FEE6C2F_MARBLES.png \ No newline at end of file diff --git a/mhs/common/mhs_common/messages/tests/test_messages/ebxml_request_multibyte_character.msg b/mhs/common/mhs_common/messages/tests/test_messages/ebxml_request_multibyte_character.msg new file mode 100644 index 000000000..7fd640a82 --- /dev/null +++ b/mhs/common/mhs_common/messages/tests/test_messages/ebxml_request_multibyte_character.msg @@ -0,0 +1,12 @@ +----=_MIME-Boundary +Content-Id: +Content-Type: text/xml; charset=UTF-8 + +{{ebxml}} +----=_MIME-Boundary +Content-Id: +Content-Transfer-Encoding: 8bit +Content-Type: application/xml; charset=UTF-8 + +¬ ❤️ 🧸 +----=_MIME-Boundary--