From febcf130bf2c4275310ee0c909e1233e6d886a12 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Sat, 11 Dec 2021 11:37:33 +0100 Subject: [PATCH] Fix encoding detection and exception on empty files The encoding detection code was trying to catch encoding-related exceptions when the file is opened. This doesn't make sense, because at this point no data has been read, therefore no encoding errors can be detected. Instead, catch encoding-related exceptions when the file contents are read. Also avoid bailing out with `Exception('Unknown encoding')` on empty files. --- codespell_lib/_codespell.py | 41 ++++++++++++++----------------- codespell_lib/tests/test_basic.py | 7 ++++++ 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 86cc2c07b19..d6f1046b6b0 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -200,30 +200,27 @@ def open_with_chardet(self, filename): return lines, encoding def open_with_internal(self, filename): - curr = 0 - while True: - try: - f = codecs.open(filename, 'r', encoding=encodings[curr]) - except UnicodeDecodeError: - if not self.quiet_level & QuietLevels.ENCODING: - print("WARNING: Decoding file using encoding=%s failed: %s" - % (encodings[curr], filename,), file=sys.stderr) - try: - print("WARNING: Trying next encoding %s" - % encodings[curr + 1], file=sys.stderr) - except IndexError: - pass - - curr += 1 - else: - lines = f.readlines() - f.close() - break - if not lines: + encoding = None + first_try = True + for encoding in encodings: + if first_try: + first_try = False + elif not self.quiet_level & QuietLevels.ENCODING: + print("WARNING: Trying next encoding %s" + % encoding, file=sys.stderr) + with codecs.open(filename, 'r', encoding=encoding) as f: + try: + lines = f.readlines() + except UnicodeDecodeError: + if not self.quiet_level & QuietLevels.ENCODING: + print("WARNING: Decoding file using encoding=%s " + "failed: %s" % (encoding, filename,), + file=sys.stderr) + else: + break + else: raise Exception('Unknown encoding') - encoding = encodings[curr] - return lines, encoding # -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:- diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index bbf2ea47ddb..ac33e9dfb04 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -272,6 +272,13 @@ def test_encoding(tmpdir, capsys): with open(f.name, 'ab') as f: f.write(u'naieve\n'.encode('utf-8')) assert cs.main(f.name) == 1 + # Encoding detection, (only try ISO 8859-1 because UTF-8 is the default) + with open(f.name, 'wb') as f: + f.write(b'Speling error, then non-ASCII: h\xe9t\xe9rog\xe9n\xe9it\xe9') + code, stdout, stderr = cs.main('-q', '0', f.name, std=True, count=False) + assert code == 1 + assert 'Speling' in stdout + assert 'iso-8859-1' in stderr # Binary file warning with open(f.name, 'wb') as f: f.write(b'\x00\x00naiive\x00\x00')