From 0ac7974abde5e8984f87d327da1f03087d5de780 Mon Sep 17 00:00:00 2001
From: Michael Hanke <michael.hanke@gmail.com>
Date: Thu, 11 Jul 2024 06:27:02 +0200
Subject: [PATCH 1/2] fix(`decode_bytes`): always backslashreplace when asked
 to

The previous implementation enabled error handling in decoding only for
the segment of a bytestring that an exception was raised for.

However, it may well be that more decoding errors exist in other parts
of the bytestring. I have a complicated real-world case where this
happens, i.e. raising `UnicodeDecodeError` again, even though
`decode_bytes` was called with `backslash_replace=True`.  Unfortunately
the data is so large that I did not manage to catch the condition
exactly.

It seems to be a needless sophistication to decode some part of the
bytestring with error handling, but not another.

There is a good chance that this patch interacts badly with the
logic that obtains the next chunk before decoding is attempted again.
---
 datalad_next/itertools/decode_bytes.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/datalad_next/itertools/decode_bytes.py b/datalad_next/itertools/decode_bytes.py
index 18ff3a16..1eca5a13 100644
--- a/datalad_next/itertools/decode_bytes.py
+++ b/datalad_next/itertools/decode_bytes.py
@@ -103,11 +103,8 @@ def handle_decoding_error(position: int,
         else:
             return (
                 position + exc.end,
-                joined_data[:position + exc.start].decode(encoding)
-                + joined_data[position + exc.start:position + exc.end].decode(
-                    encoding,
-                    errors='backslashreplace'
-                ),
+                joined_data[:position + exc.end].decode(
+                    encoding, errors='backslashreplace')
             )
 
     joined_data = b''

From 7a73d8cea219438e2436e05602d5b161947dccc3 Mon Sep 17 00:00:00 2001
From: Christian Monch <christian.moench@web.de>
Date: Thu, 11 Jul 2024 09:14:48 +0200
Subject: [PATCH 2/2] fix(`decode_bytes`): handle multiple errors

This commit fixes an issue in multiple error
handling where parts of the input strings were
repeated in the output of `decode_bytes`.

It also adds a regression test to ensure that
multiple encoding errors in a single input
chunk are handled properly.
---
 datalad_next/itertools/decode_bytes.py            | 7 +++++--
 datalad_next/itertools/tests/test_decode_bytes.py | 5 +++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/datalad_next/itertools/decode_bytes.py b/datalad_next/itertools/decode_bytes.py
index 1eca5a13..bb2cca63 100644
--- a/datalad_next/itertools/decode_bytes.py
+++ b/datalad_next/itertools/decode_bytes.py
@@ -103,8 +103,11 @@ def handle_decoding_error(position: int,
         else:
             return (
                 position + exc.end,
-                joined_data[:position + exc.end].decode(
-                    encoding, errors='backslashreplace')
+                joined_data[position:position + exc.start].decode(encoding)
+                + joined_data[position + exc.start:position + exc.end].decode(
+                    encoding,
+                    errors='backslashreplace'
+                ),
             )
 
     joined_data = b''
diff --git a/datalad_next/itertools/tests/test_decode_bytes.py b/datalad_next/itertools/tests/test_decode_bytes.py
index a463cc46..6139f7ca 100644
--- a/datalad_next/itertools/tests/test_decode_bytes.py
+++ b/datalad_next/itertools/tests/test_decode_bytes.py
@@ -35,3 +35,8 @@ def test_no_empty_strings():
     # check that empty strings are not yielded
     r = tuple(decode_bytes([b'\xc3', b'\xb6']))
     assert r == ('ö',)
+
+
+def test_multiple_errors():
+    r = ''.join(decode_bytes([b'08 War \xaf No \xaf More \xaf Trouble.shn.mp3']))
+    assert r == '08 War \\xaf No \\xaf More \\xaf Trouble.shn.mp3'