From febcf130bf2c4275310ee0c909e1233e6d886a12 Mon Sep 17 00:00:00 2001
From: Dimitri Papadopoulos
 <3234522+DimitriPapadopoulos@users.noreply.github.com>
Date: Sat, 11 Dec 2021 11:37:33 +0100
Subject: [PATCH] Fix encoding detection and exception on empty files

The encoding detection code was trying to catch encoding-related
exceptions when the file is opened. This cannot work: at that point no
data has been read, so no decoding error can be raised yet. Instead,
catch encoding-related exceptions when the file contents are read.

Also avoid bailing out with `Exception('Unknown encoding')` on empty
files.
---
 codespell_lib/_codespell.py       | 41 ++++++++++++++-----------------
 codespell_lib/tests/test_basic.py |  7 ++++++
 2 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py
index 86cc2c07b19..d6f1046b6b0 100644
--- a/codespell_lib/_codespell.py
+++ b/codespell_lib/_codespell.py
@@ -200,30 +200,27 @@ def open_with_chardet(self, filename):
         return lines, encoding
 
     def open_with_internal(self, filename):
-        curr = 0
-        while True:
-            try:
-                f = codecs.open(filename, 'r', encoding=encodings[curr])
-            except UnicodeDecodeError:
-                if not self.quiet_level & QuietLevels.ENCODING:
-                    print("WARNING: Decoding file using encoding=%s failed: %s"
-                          % (encodings[curr], filename,), file=sys.stderr)
-                    try:
-                        print("WARNING: Trying next encoding %s"
-                              % encodings[curr + 1], file=sys.stderr)
-                    except IndexError:
-                        pass
-
-                curr += 1
-            else:
-                lines = f.readlines()
-                f.close()
-                break
-        if not lines:
+        encoding = None
+        first_try = True
+        for encoding in encodings:
+            if first_try:
+                first_try = False
+            elif not self.quiet_level & QuietLevels.ENCODING:
+                print("WARNING: Trying next encoding %s"
+                      % encoding, file=sys.stderr)
+            with codecs.open(filename, 'r', encoding=encoding) as f:
+                try:
+                    lines = f.readlines()
+                except UnicodeDecodeError:
+                    if not self.quiet_level & QuietLevels.ENCODING:
+                        print("WARNING: Decoding file using encoding=%s "
+                              "failed: %s" % (encoding, filename,),
+                              file=sys.stderr)
+                else:
+                    break
+        else:
             raise Exception('Unknown encoding')
 
-        encoding = encodings[curr]
-
         return lines, encoding
 
 # -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-
diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py
index bbf2ea47ddb..ac33e9dfb04 100644
--- a/codespell_lib/tests/test_basic.py
+++ b/codespell_lib/tests/test_basic.py
@@ -272,6 +272,13 @@ def test_encoding(tmpdir, capsys):
     with open(f.name, 'ab') as f:
         f.write(u'naieve\n'.encode('utf-8'))
     assert cs.main(f.name) == 1
+    # Encoding detection (only try ISO 8859-1 because UTF-8 is the default)
+    with open(f.name, 'wb') as f:
+        f.write(b'Speling error, then non-ASCII: h\xe9t\xe9rog\xe9n\xe9it\xe9')
+    code, stdout, stderr = cs.main('-q', '0', f.name, std=True, count=False)
+    assert code == 1
+    assert 'Speling' in stdout
+    assert 'iso-8859-1' in stderr
     # Binary file warning
     with open(f.name, 'wb') as f:
         f.write(b'\x00\x00naiive\x00\x00')