diff --git a/tests/common.py b/tests/common.py
index 78bb9cbf..579e853f 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -13,11 +13,16 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
+import codecs
+from codecs import CodecInfo as CI
+import collections
import contextlib
import os
import shutil
+import sys
import tempfile
import unittest
+import warnings
import yaml
@@ -25,6 +30,7 @@
from yamllint import linter
+# Rule related stuff:
class RuleTestCase(unittest.TestCase):
def build_fake_config(self, conf):
if conf is None:
@@ -54,6 +60,10 @@ def check(self, source, conf, **kwargs):
self.assertEqual(real_problems, expected_problems)
+# Workspace related stuff:
+# A workspace file given as text plus the name of the codec to encode it
+# with; build_temp_workspace() encodes Blob contents before writing them.
+Blob = collections.namedtuple('Blob', ('text', 'encoding'))
+
+
def build_temp_workspace(files):
tempdir = tempfile.mkdtemp(prefix='yamllint-tests-')
@@ -65,6 +75,8 @@ def build_temp_workspace(files):
if type(content) is list:
os.mkdir(path)
else:
+ if isinstance(content, Blob):
+ content = content.text.encode(content.encoding)
mode = 'wb' if isinstance(content, bytes) else 'w'
with open(path, mode) as f:
f.write(content)
@@ -84,3 +96,98 @@ def temp_workspace(files):
finally:
os.chdir(backup_wd)
shutil.rmtree(wd)
+
+
+# Encoding related stuff:
+def encode_utf_32_be_sig(obj, errors='strict'):
+    """
+    Encode obj as big-endian UTF-32 preceded by the UTF-32 BE BOM
+
+    Returns a (bytes, number of characters consumed) tuple, which is the
+    shape expected from a codec's encode function.
+    """
+    encoded = codecs.encode(obj, 'utf_32_be', errors)
+    return (codecs.BOM_UTF32_BE + encoded, len(obj))
+
+
+def encode_utf_32_le_sig(obj, errors='strict'):
+    """
+    Encode obj as little-endian UTF-32 preceded by the UTF-32 LE BOM
+
+    Returns a (bytes, number of characters consumed) tuple, which is the
+    shape expected from a codec's encode function.
+    """
+    encoded = codecs.encode(obj, 'utf_32_le', errors)
+    return (codecs.BOM_UTF32_LE + encoded, len(obj))
+
+
+def encode_utf_16_be_sig(obj, errors='strict'):
+    """
+    Encode obj as big-endian UTF-16 preceded by the UTF-16 BE BOM
+
+    Returns a (bytes, number of characters consumed) tuple, which is the
+    shape expected from a codec's encode function.
+    """
+    encoded = codecs.encode(obj, 'utf_16_be', errors)
+    return (codecs.BOM_UTF16_BE + encoded, len(obj))
+
+
+def encode_utf_16_le_sig(obj, errors='strict'):
+    """
+    Encode obj as little-endian UTF-16 preceded by the UTF-16 LE BOM
+
+    Returns a (bytes, number of characters consumed) tuple, which is the
+    shape expected from a codec's encode function.
+    """
+    encoded = codecs.encode(obj, 'utf_16_le', errors)
+    return (codecs.BOM_UTF16_LE + encoded, len(obj))
+
+
+# Test-only codecs, keyed by name. Each one encodes with an explicit byte
+# order *and* a BOM; decoding is delegated to the built-in BOM-aware
+# utf_32 / utf_16 decoders.
+test_codec_infos = dict(
+    utf_32_be_sig=CI(encode_utf_32_be_sig, codecs.getdecoder('utf_32')),
+    utf_32_le_sig=CI(encode_utf_32_le_sig, codecs.getdecoder('utf_32')),
+    utf_16_be_sig=CI(encode_utf_16_be_sig, codecs.getdecoder('utf_16')),
+    utf_16_le_sig=CI(encode_utf_16_le_sig, codecs.getdecoder('utf_16')),
+)
+
+
+def _search_test_codec(name):
+    """Codec search function: return the test CodecInfo for name, or None."""
+    return test_codec_infos.get(name)
+
+
+def register_test_codecs():
+    """Make the test codecs resolvable through the codecs module."""
+    # The very same function object is handed to codecs.unregister() later:
+    # unregistering matches by identity, and evaluating
+    # ``test_codec_infos.get`` twice yields two distinct bound methods.
+    codecs.register(_search_test_codec)
+
+
+def unregister_test_codecs():
+    """
+    Remove the search function installed by register_test_codecs()
+
+    codecs.unregister() is only called on Python 3.10 or later; on older
+    versions a warning is emitted and the test codecs stay registered.
+    """
+    if sys.version_info >= (3, 10, 0):
+        codecs.unregister(_search_test_codec)
+    else:
+        warnings.warn(
+            "This version of Python doesn’t allow us to unregister codecs."
+        )
+
+
+def is_test_codec(codec):
+    """Return True if codec is the name of one of the test-only codecs."""
+    return codec in test_codec_infos
+
+
+def test_codec_built_in_equivalent(test_codec):
+    """
+    Return test_codec with every '_sig', '_be' and '_le' removed
+
+    For example, 'utf_32_be_sig' becomes 'utf_32'.
+    """
+    without_sig = test_codec.replace('_sig', '')
+    without_be = without_sig.replace('_be', '')
+    return without_be.replace('_le', '')
+
+
+def uses_bom(codec):
+    """
+    Return True if the codec name ends with '_32', '_16' or '_sig'
+
+    Those are the names of the codecs that write a BOM: the UTF-32/UTF-16
+    codecs without an explicit byte order, and every *_sig codec.
+    """
+    bom_suffixes = ('_32', '_16', '_sig')
+    return codec.endswith(bom_suffixes)
+
+
+def encoding_detectable(string, codec):
+    """
+    Tell whether the encoding is detectable once string is encoded with codec
+
+    Detection needs either a BOM or an ASCII first character, so the result
+    is True when codec writes a BOM, or when string is non-empty and starts
+    with an ASCII character. See the docstring of
+    yamllint.decoder.auto_decode().
+    """
+    if uses_bom(codec):
+        return True
+    if len(string) == 0:
+        return False
+    return string[0].isascii()
+
+
+def utf_codecs():
+    """
+    Yield the names of the UTF codecs used by the tests
+
+    The UTF-32 names come first, then UTF-16 (each as be, be_sig, le,
+    le_sig), followed by 'utf_8_sig' and 'utf_8'.
+    """
+    yield from (
+        f'utf_{chunk_size}_{endianness}{sig}'
+        for chunk_size in ('32', '16')
+        for endianness in ('be', 'le')
+        for sig in ('', '_sig')
+    )
+    yield from ('utf_8_sig', 'utf_8')
+
+
+def ws_with_files_in_many_codecs(path_template, text):
+    """
+    Build a workspace dict holding text once per detectable UTF codec
+
+    Each key is path_template formatted with the codec name and each value
+    is a Blob of (text, codec). Codecs for which encoding_detectable()
+    returns False are left out.
+    """
+    return {
+        path_template.format(codec): Blob(text, codec)
+        for codec in utf_codecs()
+        if encoding_detectable(text, codec)
+    }
diff --git a/tests/test_cli.py b/tests/test_cli.py
index d158e326..3922eb5e 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -23,7 +23,9 @@
import tempfile
import unittest
-from tests.common import build_temp_workspace, temp_workspace
+from tests.common import (build_temp_workspace, temp_workspace,
+ ws_with_files_in_many_codecs,
+ register_test_codecs, unregister_test_codecs)
from yamllint import cli
from yamllint import config
@@ -797,3 +799,52 @@ def test_multiple_parent_config_file(self):
self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
(0, './4spaces.yml:2:5: [warning] wrong indentation: '
'expected 3 but found 4 (indentation)\n', ''))
+
+
+class CommandLineEncodingTestCase(unittest.TestCase):
+    """Run the CLI on config and YAML files stored in many UTF encodings."""
+
+    @classmethod
+    def setUpClass(cls):
+        # The test codecs are needed to encode the workspace files.
+        super().setUpClass()
+        register_test_codecs()
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+        unregister_test_codecs()
+
+    def test_valid_encodings(self):
+        """Every encoded config must lint every encoded YAML file."""
+        conf = ('---\n'
+                'rules:\n'
+                ' key-ordering: enable\n')
+        sorted_correctly = ('---\n'
+                            'A: YAML\n'
+                            'Z: YAML\n')
+        sorted_incorrectly = ('---\n'
+                              'Z: YAML\n'
+                              'A: YAML\n')
+
+        config_files = ws_with_files_in_many_codecs('config_{}.yaml', conf)
+        sorted_correctly_files = ws_with_files_in_many_codecs(
+            'sorted_correctly/{}.yaml', sorted_correctly)
+        sorted_incorrectly_files = ws_with_files_in_many_codecs(
+            'sorted_incorrectly/{}.yaml', sorted_incorrectly)
+
+        # Merge the three groups into one workspace description.
+        workspace = {}
+        workspace.update(config_files)
+        workspace.update(sorted_correctly_files)
+        workspace.update(sorted_incorrectly_files)
+
+        with temp_workspace(workspace):
+            for config_path in config_files:
+                # Correctly ordered keys: the run must succeed.
+                with RunContext(self) as ctx:
+                    cli.run(('-c', config_path, 'sorted_correctly/'))
+                self.assertEqual(ctx.returncode, 0)
+                # Wrongly ordered keys: key-ordering must make the run fail.
+                with RunContext(self) as ctx:
+                    cli.run(('-c', config_path, 'sorted_incorrectly/'))
+                self.assertNotEqual(ctx.returncode, 0)
diff --git a/tests/test_decoder.py b/tests/test_decoder.py
new file mode 100644
index 00000000..3bb5d1af
--- /dev/null
+++ b/tests/test_decoder.py
@@ -0,0 +1,221 @@
+# Copyright (C) 2023 Jason Yundt
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import codecs
+import unittest
+
+from tests.common import (register_test_codecs, unregister_test_codecs,
+ utf_codecs, encoding_detectable, uses_bom,
+ is_test_codec, test_codec_built_in_equivalent)
+
+from yamllint import decoder
+
+
+# Sample texts that the tests encode with every codec from utf_codecs():
+# empty, a single ASCII character, several ASCII characters, and a string
+# that starts with a non-ASCII character.
+test_strings = (
+    "",
+    "y",
+    "yaml",
+    "🇾🇦🇲🇱❗"
+)
+# Module-level unittest fixtures: register the test codecs before this
+# module's tests run and unregister them afterwards.
+setUpModule = register_test_codecs
+tearDownModule = unregister_test_codecs
+
+
+class EncodingStuffFromCommonTestCase(unittest.TestCase):
+    """Tests for the encoding helpers imported from tests.common."""
+
+    def test_test_codecs_and_utf_codecs(self):
+        """Every codec from utf_codecs() round-trips every test string."""
+        error = "{} failed to correctly encode then decode {}."
+        for string in test_strings:
+            for codec in utf_codecs():
+                self.assertEqual(
+                    string,
+                    string.encode(codec).decode(codec),
+                    msg=error.format(repr(codec), repr(string))
+                )
+
+    def test_is_test_codec(self):
+        """Only the *_sig UTF-32 and UTF-16 names are test codecs."""
+        self.assertFalse(is_test_codec('utf_32'))
+        self.assertFalse(is_test_codec('utf_32_be'))
+        self.assertTrue(is_test_codec('utf_32_be_sig'))
+        self.assertFalse(is_test_codec('utf_32_le'))
+        self.assertTrue(is_test_codec('utf_32_le_sig'))
+
+        self.assertFalse(is_test_codec('utf_16'))
+        self.assertFalse(is_test_codec('utf_16_be'))
+        self.assertTrue(is_test_codec('utf_16_be_sig'))
+        self.assertFalse(is_test_codec('utf_16_le'))
+        self.assertTrue(is_test_codec('utf_16_le_sig'))
+
+        self.assertFalse(is_test_codec('utf_8'))
+        self.assertFalse(is_test_codec('utf_8_be'))
+
+    def test_test_codec_built_in_equivalent(self):
+        """Each test codec maps to 'utf_32' or 'utf_16'."""
+        self.assertEqual(
+            'utf_32',
+            test_codec_built_in_equivalent('utf_32_be_sig')
+        )
+        self.assertEqual(
+            'utf_32',
+            test_codec_built_in_equivalent('utf_32_le_sig')
+        )
+
+        self.assertEqual(
+            'utf_16',
+            test_codec_built_in_equivalent('utf_16_be_sig')
+        )
+        self.assertEqual(
+            'utf_16',
+            test_codec_built_in_equivalent('utf_16_le_sig')
+        )
+
+    def test_uses_bom(self):
+        """uses_bom() is True for utf_32, utf_16 and every *_sig name."""
+        self.assertTrue(uses_bom('utf_32'))
+        self.assertFalse(uses_bom('utf_32_be'))
+        self.assertTrue(uses_bom('utf_32_be_sig'))
+        self.assertFalse(uses_bom('utf_32_le'))
+        self.assertTrue(uses_bom('utf_32_le_sig'))
+
+        self.assertTrue(uses_bom('utf_16'))
+        self.assertFalse(uses_bom('utf_16_be'))
+        self.assertTrue(uses_bom('utf_16_be_sig'))
+        self.assertFalse(uses_bom('utf_16_le'))
+        self.assertTrue(uses_bom('utf_16_le_sig'))
+
+        self.assertFalse(uses_bom('utf_8'))
+        self.assertTrue(uses_bom('utf_8_sig'))
+
+    def test_encoding_detectable(self):
+        """Detectable means: the codec writes a BOM or the text starts ASCII."""
+        # No BOM + nothing
+        self.assertFalse(encoding_detectable('', 'utf_32_be'))
+        self.assertFalse(encoding_detectable('', 'utf_32_le'))
+
+        self.assertFalse(encoding_detectable('', 'utf_16_be'))
+        self.assertFalse(encoding_detectable('', 'utf_16_le'))
+
+        self.assertFalse(encoding_detectable('', 'utf_8'))
+        # BOM + nothing
+        self.assertTrue(encoding_detectable('', 'utf_32'))
+        self.assertTrue(encoding_detectable('', 'utf_32_be_sig'))
+        self.assertTrue(encoding_detectable('', 'utf_32_le_sig'))
+
+        self.assertTrue(encoding_detectable('', 'utf_16'))
+        self.assertTrue(encoding_detectable('', 'utf_16_be_sig'))
+        self.assertTrue(encoding_detectable('', 'utf_16_le_sig'))
+
+        self.assertTrue(encoding_detectable('', 'utf_8_sig'))
+        # No BOM + non-ASCII
+        self.assertFalse(encoding_detectable('Ⓝⓔ', 'utf_32_be'))
+        self.assertFalse(encoding_detectable('ⓥⓔ', 'utf_32_le'))
+
+        self.assertFalse(encoding_detectable('ⓡ ', 'utf_16_be'))
+        self.assertFalse(encoding_detectable('ⓖⓞ', 'utf_16_le'))
+
+        self.assertFalse(encoding_detectable('ⓝⓝ', 'utf_8'))
+        # No BOM + ASCII
+        self.assertTrue(encoding_detectable('a ', 'utf_32_be'))
+        self.assertTrue(encoding_detectable('gi', 'utf_32_le'))
+
+        self.assertTrue(encoding_detectable('ve', 'utf_16_be'))
+        self.assertTrue(encoding_detectable(' y', 'utf_16_le'))
+
+        self.assertTrue(encoding_detectable('ou', 'utf_8'))
+        # BOM + non-ASCII
+        self.assertTrue(encoding_detectable('␣ⓤ', 'utf_32'))
+        self.assertTrue(encoding_detectable('ⓟ', 'utf_32_be_sig'))
+        self.assertTrue(encoding_detectable('Ⓝⓔ', 'utf_32_le_sig'))
+
+        self.assertTrue(encoding_detectable('ⓥⓔ', 'utf_16'))
+        self.assertTrue(encoding_detectable('ⓡ␣', 'utf_16_be_sig'))
+        self.assertTrue(encoding_detectable('ⓖⓞ', 'utf_16_le_sig'))
+
+        self.assertTrue(encoding_detectable('ⓝⓝ', 'utf_8_sig'))
+        # BOM + ASCII
+        self.assertTrue(encoding_detectable('a ', 'utf_32'))
+        self.assertTrue(encoding_detectable('le', 'utf_32_be_sig'))
+        self.assertTrue(encoding_detectable('t ', 'utf_32_le_sig'))
+
+        self.assertTrue(encoding_detectable('yo', 'utf_16'))
+        self.assertTrue(encoding_detectable('u ', 'utf_16_be_sig'))
+        self.assertTrue(encoding_detectable('do', 'utf_16_le_sig'))
+
+        self.assertTrue(encoding_detectable('wn', 'utf_8_sig'))
+
+
+class DecoderTestCase(unittest.TestCase):
+    """Tests for detect_encoding() and auto_decode() in yamllint.decoder."""
+
+    def test_detect_encoding(self):
+        """detect_encoding() names the expected built-in codec."""
+        error1 = "{} was encoded with {}, but detect_encoding() returned {}."
+        error2 = "detect_encoding({}) returned a codec that isn’t built-in."
+        for string in test_strings:
+            for codec in utf_codecs():
+                encoded = string.encode(codec)
+
+                if not uses_bom(codec) and len(string) == 0:
+                    # No bytes at all: the utf_8 fallback is expected.
+                    expected_output = 'utf_8'
+                elif not encoding_detectable(string, codec):
+                    # Undetectable input: no particular answer is required.
+                    expected_output = None
+                elif is_test_codec(codec):
+                    expected_output = test_codec_built_in_equivalent(codec)
+                else:
+                    expected_output = codec
+
+                actual_output = decoder.detect_encoding(encoded)
+                if expected_output is not None:
+                    self.assertEqual(
+                        expected_output,
+                        actual_output,
+                        msg=error1.format(
+                            encoded,
+                            repr(codec),
+                            repr(actual_output)
+                        )
+                    )
+
+                # Raises LookupError if the returned name is not a codec.
+                codecs.lookup(actual_output)
+                # Check the returned *name*. Passing the CodecInfo object
+                # from codecs.lookup() to is_test_codec() can never match a
+                # codec name, so that assertion could not fail.
+                self.assertFalse(
+                    is_test_codec(actual_output),
+                    msg=error2.format(encoded)
+                )
+
+    def test_auto_decode(self):
+        """auto_decode() returns the original text, or fails as expected."""
+        lenient_error_handlers = (
+            'ignore',
+            'replace',
+            'backslashreplace',
+            'surrogateescape',
+        )
+        at_least_one_decode_error = False
+        for string in test_strings:
+            for codec in utf_codecs():
+                encoded = string.encode(codec)
+                if encoding_detectable(string, codec) or len(string) == 0:
+                    actual_output = decoder.auto_decode(encoded)
+                    self.assertEqual(
+                        string,
+                        actual_output,
+                        msg=f"auto_decode({encoded}) returned the wrong value."
+                    )
+                    self.assertIsInstance(actual_output, str)
+                else:
+                    # The encoding may be misdetected here, so strict
+                    # decoding is allowed (not required) to fail.
+                    try:
+                        decoder.auto_decode(encoded)
+                    except UnicodeDecodeError:
+                        at_least_one_decode_error = True
+
+                    # With a lenient handler a str must always come back.
+                    for handler in lenient_error_handlers:
+                        actual_output = decoder.auto_decode(
+                            encoded,
+                            errors=handler
+                        )
+                        self.assertIsInstance(actual_output, str)
+        # Guard against the error path silently never being exercised.
+        self.assertTrue(
+            at_least_one_decode_error,
+            msg="None of the test_strings triggered a decoding error."
+        )
diff --git a/yamllint/cli.py b/yamllint/cli.py
index 604e5940..8de3c55a 100644
--- a/yamllint/cli.py
+++ b/yamllint/cli.py
@@ -219,7 +219,7 @@ def run(argv=None):
for file in find_files_recursively(args.files, conf):
filepath = file[2:] if file.startswith('./') else file
try:
- with open(file, newline='') as f:
+ with open(file, mode='rb') as f:
problems = linter.run(f, conf, filepath)
except OSError as e:
print(e, file=sys.stderr)
diff --git a/yamllint/config.py b/yamllint/config.py
index b07229f5..45ea3c3d 100644
--- a/yamllint/config.py
+++ b/yamllint/config.py
@@ -19,6 +19,7 @@
import pathspec
import yaml
+from yamllint import decoder
import yamllint.rules
@@ -38,8 +39,8 @@ def __init__(self, content=None, file=None):
self.locale = None
if file is not None:
- with open(file) as f:
- content = f.read()
+ with open(file, mode='rb') as f:
+ content = decoder.auto_decode(f.read())
self.parse(content)
self.validate()
diff --git a/yamllint/decoder.py b/yamllint/decoder.py
new file mode 100644
index 00000000..47eb9988
--- /dev/null
+++ b/yamllint/decoder.py
@@ -0,0 +1,60 @@
+# Copyright (C) 2023 Jason Yundt
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+import codecs
+
+
+def detect_encoding(stream_data):
+    """
+    Guess which built-in codec decodes stream_data
+
+    stream_data is a bytes object. The return value is the name of one of
+    Python's standard encodings [1]: 'utf_32', 'utf_32_be', 'utf_32_le',
+    'utf_16', 'utf_16_be', 'utf_16_le', 'utf_8_sig' or 'utf_8'.
+
+    The YAML spec requires a stream to begin with a BOM or with an ASCII
+    character [2]. A BOM selects the matching BOM-aware codec; otherwise
+    the NUL bytes around the first character give away the code unit size
+    and the byte order. For input that begins with neither, the result may
+    be wrong.
+
+    [1]: https://docs.python.org/3/library/codecs.html#standard-encodings
+    [2]: https://yaml.org/spec/1.2.2/#52-character-encodings
+    """
+    size = len(stream_data)
+    # The order of the checks matters: the UTF-32 patterns are tried before
+    # the UTF-16 ones because the UTF-32 LE BOM starts with the UTF-16 LE BOM.
+    if stream_data.startswith(codecs.BOM_UTF32_BE):
+        return 'utf_32'
+    if size >= 4 and stream_data.startswith(b'\x00\x00\x00'):
+        return 'utf_32_be'
+    if stream_data.startswith(codecs.BOM_UTF32_LE):
+        return 'utf_32'
+    if stream_data[1:4] == b'\x00\x00\x00':
+        return 'utf_32_le'
+    if stream_data.startswith(codecs.BOM_UTF16_BE):
+        return 'utf_16'
+    if size >= 2 and stream_data.startswith(b'\x00'):
+        return 'utf_16_be'
+    if stream_data.startswith(codecs.BOM_UTF16_LE):
+        return 'utf_16'
+    if stream_data[1:2] == b'\x00':
+        return 'utf_16_le'
+    if stream_data.startswith(codecs.BOM_UTF8):
+        return 'utf_8_sig'
+    return 'utf_8'
+
+
+def auto_decode(stream_data, errors='strict'):
+    """
+    Decode stream_data with the encoding reported by detect_encoding()
+
+    errors is handed to the decode() call as its error handler. Because
+    detect_encoding() needs a leading BOM or ASCII character to be
+    reliable, other input may be decoded with the wrong codec.
+    """
+    detected = detect_encoding(stream_data)
+    return stream_data.decode(encoding=detected, errors=errors)
diff --git a/yamllint/linter.py b/yamllint/linter.py
index 0de1f716..caf5111e 100644
--- a/yamllint/linter.py
+++ b/yamllint/linter.py
@@ -18,6 +18,7 @@
import yaml
+from yamllint import decoder
from yamllint import parser
@@ -188,6 +189,8 @@ def get_syntax_error(buffer):
def _run(buffer, conf, filepath):
assert hasattr(buffer, '__getitem__'), \
'_run() argument must be a buffer, not a stream'
+ if isinstance(buffer, bytes):
+ buffer = decoder.auto_decode(buffer)
first_line = next(parser.line_generator(buffer)).content
if re.match(r'^#\s*yamllint disable-file\s*$', first_line):