Skip to content

Commit

Permalink
Fix encoding detection and exception on empty files (#2195)
Browse files Browse the repository at this point in the history
The encoding detection code was trying to catch encoding-related
exceptions when the file is opened. This doesn't make sense: at that
point no data has been read, so no encoding errors can be detected yet.
Instead, catch encoding-related exceptions when the file contents are
read.

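Also avoid bailing out with `Exception('Unknown encoding')` on empty
files: an empty file yields no lines, and the old `if not lines:` check
mistook that for a failure to decode.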
  • Loading branch information
DimitriPapadopoulos authored Oct 12, 2022
1 parent ad64452 commit 900f186
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 22 deletions.
41 changes: 19 additions & 22 deletions codespell_lib/_codespell.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,30 +200,27 @@ def open_with_chardet(self, filename):
return lines, encoding

def open_with_internal(self, filename):
curr = 0
while True:
try:
f = codecs.open(filename, 'r', encoding=encodings[curr])
except UnicodeDecodeError:
if not self.quiet_level & QuietLevels.ENCODING:
print("WARNING: Decoding file using encoding=%s failed: %s"
% (encodings[curr], filename,), file=sys.stderr)
try:
print("WARNING: Trying next encoding %s"
% encodings[curr + 1], file=sys.stderr)
except IndexError:
pass

curr += 1
else:
lines = f.readlines()
f.close()
break
if not lines:
encoding = None
first_try = True
for encoding in encodings:
if first_try:
first_try = False
elif not self.quiet_level & QuietLevels.ENCODING:
print("WARNING: Trying next encoding %s"
% encoding, file=sys.stderr)
with codecs.open(filename, 'r', encoding=encoding) as f:
try:
lines = f.readlines()
except UnicodeDecodeError:
if not self.quiet_level & QuietLevels.ENCODING:
print("WARNING: Decoding file using encoding=%s "
"failed: %s" % (encoding, filename,),
file=sys.stderr)
else:
break
else:
raise Exception('Unknown encoding')

encoding = encodings[curr]

return lines, encoding

# -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-
Expand Down
13 changes: 13 additions & 0 deletions codespell_lib/tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,19 @@ def test_encoding(tmpdir, capsys):
with open(f.name, 'ab') as f:
f.write(u'naieve\n'.encode('utf-8'))
assert cs.main(f.name) == 1
# Encoding detection (only try ISO 8859-1 because UTF-8 is the default)
with open(f.name, 'wb') as f:
f.write(b'Speling error, non-ASCII: h\xe9t\xe9rog\xe9n\xe9it\xe9\n')
# check warnings about wrong encoding are enabled with "-q 0"
code, stdout, stderr = cs.main('-q', '0', f.name, std=True, count=True)
assert code == 1
assert 'Speling' in stdout
assert 'iso-8859-1' in stderr
# check warnings about wrong encoding are disabled with "-q 1"
code, stdout, stderr = cs.main('-q', '1', f.name, std=True, count=True)
assert code == 1
assert 'Speling' in stdout
assert 'iso-8859-1' not in stderr
# Binary file warning
with open(f.name, 'wb') as f:
f.write(b'\x00\x00naiive\x00\x00')
Expand Down

0 comments on commit 900f186

Please sign in to comment.