From 0ac7974abde5e8984f87d327da1f03087d5de780 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Thu, 11 Jul 2024 06:27:02 +0200 Subject: [PATCH 1/2] fix(`decode_bytes`): always backslashreplace when asked to The previous implementation enabled error handling in decoding only for the segment of a bytestring that an exception was raised for. However, it may well be that more decoding errors exist in other parts of the bytestring. I have a complicated real-world case where this happens, i.e. raising `UnicodeDecodeError` again, even though `decode_bytes` was called with `backslash_replace=True`. Unfortunately the data is so large that I did not manage to catch the condition exactly. It seems to be a needless sophistication to decode some part of the bytestring with error handling, but not another. There is a good chance that this patch is badly interacting with the logic to obtain the next chunk before attempting a decoding again. --- datalad_next/itertools/decode_bytes.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/datalad_next/itertools/decode_bytes.py b/datalad_next/itertools/decode_bytes.py index 18ff3a16..1eca5a13 100644 --- a/datalad_next/itertools/decode_bytes.py +++ b/datalad_next/itertools/decode_bytes.py @@ -103,11 +103,8 @@ def handle_decoding_error(position: int, else: return ( position + exc.end, - joined_data[:position + exc.start].decode(encoding) - + joined_data[position + exc.start:position + exc.end].decode( - encoding, - errors='backslashreplace' - ), + joined_data[:position + exc.end].decode( + encoding, errors='backslashreplace') ) joined_data = b'' From 7a73d8cea219438e2436e05602d5b161947dccc3 Mon Sep 17 00:00:00 2001 From: Christian Monch Date: Thu, 11 Jul 2024 09:14:48 +0200 Subject: [PATCH 2/2] fix(`decode_bytes`): handle multiple errors This commit fixes an issue in multiple error handling where parts of the input strings were repeated in the output of `decode_bytes`. 
It also adds a regression test to ensure that multiple encoding errors in a single input chunk are handled properly. --- datalad_next/itertools/decode_bytes.py | 7 +++++-- datalad_next/itertools/tests/test_decode_bytes.py | 5 +++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/datalad_next/itertools/decode_bytes.py b/datalad_next/itertools/decode_bytes.py index 1eca5a13..bb2cca63 100644 --- a/datalad_next/itertools/decode_bytes.py +++ b/datalad_next/itertools/decode_bytes.py @@ -103,8 +103,11 @@ def handle_decoding_error(position: int, else: return ( position + exc.end, - joined_data[:position + exc.end].decode( - encoding, errors='backslashreplace') + joined_data[position:position + exc.start].decode(encoding) + + joined_data[position + exc.start:position + exc.end].decode( + encoding, + errors='backslashreplace' + ), ) joined_data = b'' diff --git a/datalad_next/itertools/tests/test_decode_bytes.py b/datalad_next/itertools/tests/test_decode_bytes.py index a463cc46..6139f7ca 100644 --- a/datalad_next/itertools/tests/test_decode_bytes.py +++ b/datalad_next/itertools/tests/test_decode_bytes.py @@ -35,3 +35,8 @@ def test_no_empty_strings(): # check that empty strings are not yielded r = tuple(decode_bytes([b'\xc3', b'\xb6'])) assert r == ('ö',) + + +def test_multiple_errors(): + r = ''.join(decode_bytes([b'08 War \xaf No \xaf More \xaf Trouble.shn.mp3'])) + assert r == '08 War \\xaf No \\xaf More \\xaf Trouble.shn.mp3'