import codecs
import collections
import sys
import warnings


# Workspace related stuff:
# A piece of text together with the name of the codec that must be used to
# turn it into the bytes that get written to disk.
Blob = collections.namedtuple('Blob', ('text', 'encoding'))


# Encoding related stuff:
def _encode_with_bom(bom, codec, obj, errors):
    """
    Encode obj with codec and put bom in front of the result

    Returns a (bytes, length of obj) tuple, which is the shape that an
    encoder function stored in a codecs.CodecInfo has to return.
    """
    return bom + codecs.encode(obj, codec, errors), len(obj)


def encode_utf_32_be_sig(obj, errors='strict'):
    """Encode obj as big-endian UTF-32 preceded by a BOM"""
    return _encode_with_bom(codecs.BOM_UTF32_BE, 'utf_32_be', obj, errors)


def encode_utf_32_le_sig(obj, errors='strict'):
    """Encode obj as little-endian UTF-32 preceded by a BOM"""
    return _encode_with_bom(codecs.BOM_UTF32_LE, 'utf_32_le', obj, errors)


def encode_utf_16_be_sig(obj, errors='strict'):
    """Encode obj as big-endian UTF-16 preceded by a BOM"""
    return _encode_with_bom(codecs.BOM_UTF16_BE, 'utf_16_be', obj, errors)


def encode_utf_16_le_sig(obj, errors='strict'):
    """Encode obj as little-endian UTF-16 preceded by a BOM"""
    return _encode_with_bom(codecs.BOM_UTF16_LE, 'utf_16_le', obj, errors)


# Codecs that only exist for the tests.  Encoding always writes a BOM for a
# fixed byte order; decoding is delegated to the built-in utf_32 / utf_16
# decoders.
test_codec_infos = {
    'utf_32_be_sig': codecs.CodecInfo(
        encode_utf_32_be_sig, codecs.getdecoder('utf_32')
    ),
    'utf_32_le_sig': codecs.CodecInfo(
        encode_utf_32_le_sig, codecs.getdecoder('utf_32')
    ),
    'utf_16_be_sig': codecs.CodecInfo(
        encode_utf_16_be_sig, codecs.getdecoder('utf_16')
    ),
    'utf_16_le_sig': codecs.CodecInfo(
        encode_utf_16_le_sig, codecs.getdecoder('utf_16')
    ),
}

# Every ``test_codec_infos.get`` expression builds a new bound-method object.
# codecs.unregister() has to be given the same object that was passed to
# codecs.register(), so one reference is kept and used for both calls.
_search_test_codecs = test_codec_infos.get


def register_test_codecs():
    """Make the codecs in test_codec_infos available through codecs.lookup()"""
    codecs.register(_search_test_codecs)


def unregister_test_codecs():
    """
    Undo register_test_codecs()

    Python versions older than 3.10 get a warning instead, and the test
    codecs stay registered.
    """
    if sys.version_info >= (3, 10, 0):
        codecs.unregister(_search_test_codecs)
    else:
        warnings.warn(
            "This version of Python doesn’t allow us to unregister codecs."
        )


def is_test_codec(codec):
    """Returns True if codec is the name of one of the test-only codecs"""
    return codec in test_codec_infos


def test_codec_built_in_equivalent(test_codec):
    """
    Returns the name of the built-in codec that can decode test_codec’s output

    This is test_codec with every '_sig', '_be' and '_le' removed, e.g.
    'utf_16_le_sig' becomes 'utf_16'.
    """
    return_value = test_codec
    for suffix in ('_sig', '_be', '_le'):
        return_value = return_value.replace(suffix, '')
    return return_value


# The name starts with "test_", so pytest would otherwise collect this helper
# as a test in every module that imports it and then fail to find a fixture
# called "test_codec".
test_codec_built_in_equivalent.__test__ = False


def uses_bom(codec):
    """Returns True if codec’s name ends with '_32', '_16' or '_sig'"""
    return codec.endswith(('_32', '_16', '_sig'))


def encoding_detectable(string, codec):
    """
    Returns True if encoding can be detected after string is encoded

    Encoding detection only works if you’re using a BOM or the first character
    is ASCII. See yamllint.decoder.detect_encoding()’s docstring.
    """
    # ''.isascii() is True, so the emptiness check cannot be dropped.
    return uses_bom(codec) or (bool(string) and string[0].isascii())


def utf_codecs():
    """
    Yields the name of every codec that the encoding tests exercise

    That is utf_{32,16}_{be,le} with and without the '_sig' suffix, followed
    by 'utf_8_sig' and 'utf_8'.
    """
    for chunk_size in ('32', '16'):
        for endianness in ('be', 'le'):
            for sig in ('', '_sig'):
                yield f'utf_{chunk_size}_{endianness}{sig}'
    yield 'utf_8_sig'
    yield 'utf_8'


def ws_with_files_in_many_codecs(path_template, text):
    """
    Returns a workspace dict holding text once per codec it is detectable in

    The keys are path_template.format(codec) and the values are
    Blob(text, codec).  Codecs for which encoding_detectable(text, codec) is
    False are left out.
    """
    return {
        path_template.format(codec): Blob(text, codec)
        for codec in utf_codecs()
        if encoding_detectable(text, codec)
    }
class EncodingStuffFromCommonTestCase(unittest.TestCase):
    """Checks for the encoding helpers imported from tests.common"""

    def _check_each(self, function, expected, cases):
        """Assert that function(*case) is true (or false) for every case"""
        verify = self.assertTrue if expected else self.assertFalse
        for case in cases:
            verify(function(*case))

    def test_test_codecs_and_utf_codecs(self):
        """Every codec must give back the text it was asked to encode"""
        for text in test_strings:
            for codec in utf_codecs():
                round_tripped = text.encode(codec).decode(codec)
                self.assertEqual(
                    text,
                    round_tripped,
                    msg=(f"{codec!r} failed to correctly encode then "
                         f"decode {text!r}.")
                )

    def test_is_test_codec(self):
        """Only the four *_sig UTF-16/UTF-32 names are test codecs"""
        test_only = (
            'utf_32_be_sig', 'utf_32_le_sig',
            'utf_16_be_sig', 'utf_16_le_sig',
        )
        not_test_only = (
            'utf_32', 'utf_32_be', 'utf_32_le',
            'utf_16', 'utf_16_be', 'utf_16_le',
            'utf_8', 'utf_8_be',
        )
        self._check_each(is_test_codec, True, [(n,) for n in test_only])
        self._check_each(is_test_codec, False, [(n,) for n in not_test_only])

    def test_test_codec_built_in_equivalent(self):
        """Each test codec maps to the built-in codec of the same width"""
        equivalents = {
            'utf_32_be_sig': 'utf_32',
            'utf_32_le_sig': 'utf_32',
            'utf_16_be_sig': 'utf_16',
            'utf_16_le_sig': 'utf_16',
        }
        for test_codec, built_in in equivalents.items():
            self.assertEqual(
                built_in,
                test_codec_built_in_equivalent(test_codec)
            )

    def test_uses_bom(self):
        """uses_bom() must separate BOM-writing names from the others"""
        with_bom = (
            'utf_32', 'utf_32_be_sig', 'utf_32_le_sig',
            'utf_16', 'utf_16_be_sig', 'utf_16_le_sig',
            'utf_8_sig',
        )
        without_bom = (
            'utf_32_be', 'utf_32_le',
            'utf_16_be', 'utf_16_le',
            'utf_8',
        )
        self._check_each(uses_bom, True, [(n,) for n in with_bom])
        self._check_each(uses_bom, False, [(n,) for n in without_bom])

    def test_encoding_detectable(self):
        """Detectable means: a BOM is written or the first char is ASCII"""
        undetectable = (
            # No BOM + nothing
            ('', 'utf_32_be'),
            ('', 'utf_32_le'),
            ('', 'utf_16_be'),
            ('', 'utf_16_le'),
            ('', 'utf_8'),
            # No BOM + non-ASCII
            ('Ⓝⓔ', 'utf_32_be'),
            ('ⓥⓔ', 'utf_32_le'),
            ('ⓡ ', 'utf_16_be'),
            ('ⓖⓞ', 'utf_16_le'),
            ('ⓝⓝ', 'utf_8'),
        )
        detectable = (
            # BOM + nothing
            ('', 'utf_32'),
            ('', 'utf_32_be_sig'),
            ('', 'utf_32_le_sig'),
            ('', 'utf_16'),
            ('', 'utf_16_be_sig'),
            ('', 'utf_16_le_sig'),
            ('', 'utf_8_sig'),
            # No BOM + ASCII
            ('a ', 'utf_32_be'),
            ('gi', 'utf_32_le'),
            ('ve', 'utf_16_be'),
            (' y', 'utf_16_le'),
            ('ou', 'utf_8'),
            # BOM + non-ASCII
            ('␣ⓤ', 'utf_32'),
            ('ⓟ␤', 'utf_32_be_sig'),
            ('Ⓝⓔ', 'utf_32_le_sig'),
            ('ⓥⓔ', 'utf_16'),
            ('ⓡ␣', 'utf_16_be_sig'),
            ('ⓖⓞ', 'utf_16_le_sig'),
            ('ⓝⓝ', 'utf_8_sig'),
            # BOM + ASCII
            ('a ', 'utf_32'),
            ('le', 'utf_32_be_sig'),
            ('t ', 'utf_32_le_sig'),
            ('yo', 'utf_16'),
            ('u ', 'utf_16_be_sig'),
            ('do', 'utf_16_le_sig'),
            ('wn', 'utf_8_sig'),
        )
        self._check_each(encoding_detectable, False, undetectable)
        self._check_each(encoding_detectable, True, detectable)
class DecoderTestCase(unittest.TestCase):
    """Runs yamllint.decoder over every test string in every UTF codec"""

    def test_detect_encoding(self):
        """
        detect_encoding() must return the right built-in codec name

        When the encoding is not detectable, any answer is accepted as long as
        it names a codec that exists and is not one of the test-only codecs.
        """
        error1 = "{} was encoded with {}, but detect_encoding() returned {}."
        error2 = "detect_encoding({}) returned a codec that isn’t built-in."
        for string in test_strings:
            for codec in utf_codecs():
                encoded = string.encode(codec)

                if not uses_bom(codec) and len(string) == 0:
                    # Nothing was written at all, not even a BOM.
                    expected_output = 'utf_8'
                elif not encoding_detectable(string, codec):
                    expected_output = None
                elif is_test_codec(codec):
                    expected_output = test_codec_built_in_equivalent(codec)
                else:
                    expected_output = codec

                actual_output = decoder.detect_encoding(encoded)
                if expected_output is not None:
                    self.assertEqual(
                        expected_output,
                        actual_output,
                        msg=error1.format(
                            encoded,
                            repr(codec),
                            repr(actual_output)
                        )
                    )

                # Raises LookupError if the returned name is not a codec.
                codecs.lookup(actual_output)
                # is_test_codec() takes a codec *name*.  Handing it the
                # CodecInfo object returned by codecs.lookup() can never
                # match, which would turn this assertion into a no-op.
                self.assertFalse(
                    is_test_codec(actual_output),
                    msg=error2.format(encoded)
                )

    def test_auto_decode(self):
        """
        auto_decode() must round-trip detectable input

        Undetectable input is allowed to raise UnicodeDecodeError with the
        default error handler (and at least one test string has to), but the
        lenient error handlers must always produce a str.
        """
        lenient_error_handlers = (
            'ignore',
            'replace',
            'backslashreplace',
            'surrogateescape',
        )
        at_least_one_decode_error = False
        for string in test_strings:
            for codec in utf_codecs():
                encoded = string.encode(codec)
                if encoding_detectable(string, codec) or len(string) == 0:
                    actual_output = decoder.auto_decode(encoded)
                    self.assertEqual(
                        string,
                        actual_output,
                        msg=f"auto_decode({encoded}) returned the wrong value."
                    )
                    self.assertIsInstance(actual_output, str)
                else:
                    try:
                        decoder.auto_decode(encoded)
                    except UnicodeDecodeError:
                        at_least_one_decode_error = True

                for handler in lenient_error_handlers:
                    actual_output = decoder.auto_decode(
                        encoded,
                        errors=handler
                    )
                    self.assertIsInstance(actual_output, str)
        self.assertTrue(
            at_least_one_decode_error,
            msg="None of the test_strings triggered a decoding error."
        )
# Copyright (C) 2023 Jason Yundt
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import codecs


def detect_encoding(stream_data):
    """
    Guess which Unicode codec a YAML byte stream was written with

    stream_data is a bytes object.  The return value is the name of one of
    Python’s built-in codecs [1]: 'utf_32', 'utf_32_be', 'utf_32_le',
    'utf_16', 'utf_16_be', 'utf_16_le', 'utf_8_sig' or 'utf_8'.

    Only the first four bytes are looked at.  A BOM selects the matching
    BOM-aware codec; without one, the position of the NUL bytes next to the
    first byte selects the width and the byte order.  The YAML spec says that
    streams must begin with a BOM or an ASCII character (chapter 5.2 [2]), so
    for input that begins with neither the result might be wrong.  Input that
    matches no pattern at all, empty input included, is reported as 'utf_8'.

    [1]: https://docs.python.org/3/library/codecs.html#standard-encodings
    [2]: https://yaml.org/spec/1.2.2/#52-character-encodings
    """
    head = stream_data[:4]
    nul = b'\x00'
    # The order is significant: the UTF-32-LE BOM starts with the bytes of
    # the UTF-16-LE BOM, and the NUL patterns of UTF-32 also satisfy the
    # shorter UTF-16 ones, so the wider encodings are probed first.
    probes = (
        (head.startswith(codecs.BOM_UTF32_BE), 'utf_32'),
        (len(head) == 4 and head[:3] == nul * 3, 'utf_32_be'),
        (head.startswith(codecs.BOM_UTF32_LE), 'utf_32'),
        (head[1:] == nul * 3, 'utf_32_le'),
        (head.startswith(codecs.BOM_UTF16_BE), 'utf_16'),
        (len(head) >= 2 and head[:1] == nul, 'utf_16_be'),
        (head.startswith(codecs.BOM_UTF16_LE), 'utf_16'),
        (head[1:2] == nul, 'utf_16_le'),
        (head.startswith(codecs.BOM_UTF8), 'utf_8_sig'),
    )
    for matched, codec_name in probes:
        if matched:
            return codec_name
    return 'utf_8'


def auto_decode(stream_data, errors='strict'):
    """
    Decode stream_data with the codec that detect_encoding() picks for it

    errors is handed to bytes.decode() unchanged.  Because detection relies on
    a leading BOM or ASCII character, input that has neither may be decoded
    with the wrong codec or, with errors='strict', raise UnicodeDecodeError.
    """
    codec_name = detect_encoding(stream_data)
    return stream_data.decode(encoding=codec_name, errors=errors)