Skip to content

Commit

Permalink
decoder: Autodetect encoding of YAML files
Browse files Browse the repository at this point in the history
Before this change, yamllint would open YAML files using open()’s
default encoding. As long as UTF-8 mode isn’t enabled, open() defaults
to using the system’s locale encoding [1][2].

Most of the time, the locale encoding on Linux systems is UTF-8 [3][4],
but it doesn’t have to be [5]. Additionally, the locale encoding on
Windows systems is the system’s ANSI code page [6]. As a result, you
would have to either enable UTF-8 mode, give Python a custom manifest or
enable a beta feature in Windows settings in order to lint UTF-8 YAML
files on Windows [2][7].

Finally, using open()’s default encoding is a violation of the YAML
spec. Chapter 5.2 says:

	“On input, a YAML processor must support the UTF-8 and UTF-16
	character encodings. For JSON compatibility, the UTF-32
	encodings must also be supported.

	If a character stream begins with a byte order mark, the
	character encoding will be taken to be as indicated by the byte
	order mark. Otherwise, the stream must begin with an ASCII
	character. This allows the encoding to be deduced by the pattern
	of null (x00) characters.” [8]

This change fixes all of those problems by implementing the YAML spec’s
character encoding detection algorithm. Now, as long as a YAML file
begins with either a byte order mark or an ASCII character, yamllint
will automatically detect it as being UTF-8, UTF-16 or UTF-32. Other
character encodings are not supported at the moment.

Fixes adrienverge#218. Fixes adrienverge#238. Fixes adrienverge#347.

[1]: <https://docs.python.org/3.12/library/functions.html#open>
[2]: <https://docs.python.org/3.12/library/os.html#utf8-mode>
[3]: <https://sourceware.org/glibc/manual/html_node/Extended-Char-Intro.html>
[4]: <https://wiki.musl-libc.org/functional-differences-from-glibc.html#Character-sets-and-locale>
[5]: <https://sourceware.org/git/?p=glibc.git;a=blob;f=localedata/SUPPORTED;h=c8b63cc2fe2b4547f2fb1bff6193da68d70bd563;hb=36f2487f13e3540be9ee0fb51876b1da72176d3f>
[6]: <https://docs.python.org/3.12/glossary.html#term-locale-encoding>
[7]: <https://learn.microsoft.com/en-us/windows/apps/design/globalizing/use-utf8-code-page>
[8]: <https://yaml.org/spec/1.2.2/#52-character-encodings>
  • Loading branch information
Jayman2000 committed Jan 13, 2024
1 parent b6cf981 commit e9f551e
Show file tree
Hide file tree
Showing 7 changed files with 457 additions and 5 deletions.
108 changes: 108 additions & 0 deletions tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,112 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import codecs
import collections
import contextlib
import os
import shutil
import sys
import tempfile
import unittest
import warnings
from codecs import CodecInfo as CI

import yaml

from yamllint import linter
from yamllint.config import YamlLintConfig


# Encoding related stuff:
def encode_utf_32_be_sig(obj, errors='strict'):
    """
    Encode obj as big-endian UTF-32 prefixed with a byte order mark.

    Returns a tuple of the encoded bytes (BOM included) and len(obj), which
    is the (output, length consumed) shape used by codec encoders.
    """
    payload = codecs.encode(obj, 'utf_32_be', errors)
    return codecs.BOM_UTF32_BE + payload, len(obj)


def encode_utf_32_le_sig(obj, errors='strict'):
    """
    Encode obj as little-endian UTF-32 prefixed with a byte order mark.

    Returns a tuple of the encoded bytes (BOM included) and len(obj), which
    is the (output, length consumed) shape used by codec encoders.
    """
    payload = codecs.encode(obj, 'utf_32_le', errors)
    return codecs.BOM_UTF32_LE + payload, len(obj)


def encode_utf_16_be_sig(obj, errors='strict'):
    """
    Encode obj as big-endian UTF-16 prefixed with a byte order mark.

    Returns a tuple of the encoded bytes (BOM included) and len(obj), which
    is the (output, length consumed) shape used by codec encoders.
    """
    payload = codecs.encode(obj, 'utf_16_be', errors)
    return codecs.BOM_UTF16_BE + payload, len(obj)


def encode_utf_16_le_sig(obj, errors='strict'):
    """
    Encode obj as little-endian UTF-16 prefixed with a byte order mark.

    Returns a tuple of the encoded bytes (BOM included) and len(obj), which
    is the (output, length consumed) shape used by codec encoders.
    """
    payload = codecs.encode(obj, 'utf_16_le', errors)
    return codecs.BOM_UTF16_LE + payload, len(obj)


# Test-only codecs, keyed by codec name. Each one pairs an encoder that
# writes an explicit BOM in a fixed byte order with the built-in decoder of
# the same width.
test_codec_infos = {
    codec_name: CI(encoder, codecs.getdecoder(built_in_name))
    for codec_name, encoder, built_in_name in (
        ('utf_32_be_sig', encode_utf_32_be_sig, 'utf_32'),
        ('utf_32_le_sig', encode_utf_32_le_sig, 'utf_32'),
        ('utf_16_be_sig', encode_utf_16_be_sig, 'utf_16'),
        ('utf_16_le_sig', encode_utf_16_le_sig, 'utf_16'),
    )
}


def _find_test_codec(encoding):
    """
    Codec search function for the test-only codecs.

    Returns the CodecInfo stored under encoding in test_codec_infos, or None
    when there is no such test codec.
    """
    return test_codec_infos.get(encoding)


def register_test_codecs():
    """
    Make the codecs listed in test_codec_infos available through codecs.
    """
    # A module-level function is registered instead of test_codec_infos.get:
    # every attribute access on the dict builds a new bound method object,
    # and CPython's codecs.unregister() matches the search function by
    # identity, so a freshly created ``.get`` would never be found again.
    codecs.register(_find_test_codec)


def unregister_test_codecs():
    """
    Undo register_test_codecs() where the running Python supports it.

    codecs.unregister() only exists on Python 3.10 and later; on older
    versions a warning is emitted and the test codecs stay registered.
    """
    if sys.version_info >= (3, 10, 0):
        # Must be the very same object that register_test_codecs() passed
        # to codecs.register().
        codecs.unregister(_find_test_codec)
    else:
        warnings.warn(
            "This version of Python doesn’t allow us to unregister codecs.",
            stacklevel=1
        )


def is_test_codec(codec):
    """
    Returns True if codec is the name of one of the codecs defined in
    test_codec_infos.
    """
    return codec in test_codec_infos


def test_codec_built_in_equivalent(test_codec):
    """
    Returns test_codec with every '_sig', '_be' and '_le' removed.

    For example, 'utf_32_be_sig' becomes 'utf_32'.
    """
    return (
        test_codec
        .replace('_sig', '')
        .replace('_be', '')
        .replace('_le', '')
    )


def uses_bom(codec):
    """
    Returns True if the codec name ends with '_32', '_16' or '_sig'.

    Those are the names without an explicit byte order, plus the '_sig'
    variants.
    """
    return codec.endswith(('_32', '_16', '_sig'))


def encoding_detectable(string, codec):
    """
    Returns True if the encoding can be detected once string is encoded
    with codec.

    Detection only works when a BOM is written or when the first character
    of string is ASCII. See yamllint.decoder.auto_decode()’s docstring.
    """
    if uses_bom(codec):
        return True
    if len(string) == 0:
        # Nothing to inspect and no BOM to rely on.
        return False
    return string[0].isascii()


def utf_codecs():
    """
    Yields the names of the UTF codecs used by the tests.

    The UTF-32 names come first, then UTF-16, each in big- and little-endian
    byte order, without and then with the '_sig' suffix. 'utf_8_sig' and
    'utf_8' come last.
    """
    yield from (
        f'utf_{width}_{byte_order}{suffix}'
        for width in ('32', '16')
        for byte_order in ('be', 'le')
        for suffix in ('', '_sig')
    )
    yield from ('utf_8_sig', 'utf_8')


# Rule related stuff:
class RuleTestCase(unittest.TestCase):
def build_fake_config(self, conf):
if conf is None:
Expand Down Expand Up @@ -54,6 +148,10 @@ def check(self, source, conf, **kwargs):
self.assertEqual(real_problems, expected_problems)


# Workspace related stuff:
# A file's text paired with the name of the codec that
# build_temp_workspace() uses to encode it before writing.
Blob = collections.namedtuple('Blob', 'text encoding')


def build_temp_workspace(files):
tempdir = tempfile.mkdtemp(prefix='yamllint-tests-')

Expand All @@ -65,6 +163,8 @@ def build_temp_workspace(files):
if isinstance(content, list):
os.mkdir(path)
else:
if isinstance(content, Blob):
content = content.text.encode(content.encoding)
mode = 'wb' if isinstance(content, bytes) else 'w'
with open(path, mode) as f:
f.write(content)
Expand All @@ -84,3 +184,11 @@ def temp_workspace(files):
finally:
os.chdir(backup_wd)
shutil.rmtree(wd)


def ws_with_files_in_many_codecs(path_template, text):
    """
    Returns a workspace dict holding text once per detectable codec.

    path_template is formatted with each codec name from utf_codecs() to get
    the file path. Each value is a Blob of text and that codec. Codecs for
    which encoding_detectable(text, codec) is false are left out.
    """
    return {
        path_template.format(codec): Blob(text, codec)
        for codec in utf_codecs()
        if encoding_detectable(text, codec)
    }
57 changes: 56 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@
import unittest
from io import StringIO

from tests.common import build_temp_workspace, temp_workspace
from tests.common import (
build_temp_workspace,
register_test_codecs,
temp_workspace,
unregister_test_codecs,
ws_with_files_in_many_codecs,
)

from yamllint import cli, config

Expand Down Expand Up @@ -815,3 +821,52 @@ def test_multiple_parent_config_file(self):
self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
(0, './4spaces.yml:2:5: [warning] wrong indentation: '
'expected 3 but found 4 (indentation)\n', ''))


class CommandLineEncodingTestCase(unittest.TestCase):
    """
    Runs the command line against config and YAML files written in each
    codec that ws_with_files_in_many_codecs() produces.
    """

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # The workspace files are written with the test-only '_sig' codecs.
        register_test_codecs()

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        unregister_test_codecs()

    def test_valid_encodings(self):
        # The same config, once per codec.
        conf = ('---\n'
                'rules:\n'
                ' key-ordering: enable\n')
        config_files = ws_with_files_in_many_codecs('config_{}.yaml', conf)

        # A document that satisfies key-ordering, once per codec.
        sorted_correctly = ('---\n'
                            'A: YAML\n'
                            'Z: YAML\n')
        good_files = ws_with_files_in_many_codecs(
            'sorted_correctly/{}.yaml', sorted_correctly)

        # A document that violates key-ordering, once per codec.
        sorted_incorrectly = ('---\n'
                              'Z: YAML\n'
                              'A: YAML\n')
        bad_files = ws_with_files_in_many_codecs(
            'sorted_incorrectly/{}.yaml', sorted_incorrectly)

        workspace = {}
        workspace.update(config_files)
        workspace.update(good_files)
        workspace.update(bad_files)

        with temp_workspace(workspace):
            for config_path in config_files:
                # With each config encoding, correctly sorted files must
                # pass...
                with RunContext(self) as ctx:
                    cli.run(('-c', config_path, 'sorted_correctly/'))
                self.assertEqual(ctx.returncode, 0)

                # ...and incorrectly sorted ones must be reported.
                with RunContext(self) as ctx:
                    cli.run(('-c', config_path, 'sorted_incorrectly/'))
                self.assertNotEqual(ctx.returncode, 0)
Loading

0 comments on commit e9f551e

Please sign in to comment.