diff --git a/news/7667.bugfix b/news/7667.bugfix
new file mode 100644
index 00000000000..e42e128e97e
--- /dev/null
+++ b/news/7667.bugfix
@@ -0,0 +1 @@
+Fix extraction of files with utf-8 encoded paths from tars.
diff --git a/src/pip/_internal/utils/unpacking.py b/src/pip/_internal/utils/unpacking.py
index 7252dc217bf..7104753ad99 100644
--- a/src/pip/_internal/utils/unpacking.py
+++ b/src/pip/_internal/utils/unpacking.py
@@ -14,6 +14,8 @@
 import tarfile
 import zipfile
 
+from pip._vendor.six import ensure_text
+
 from pip._internal.exceptions import InstallationError
 from pip._internal.utils.filetypes import (
     BZ2_EXTENSIONS,
@@ -174,7 +176,7 @@ def untar_file(filename, location):
             'Cannot determine compression type for file %s', filename,
         )
         mode = 'r:*'
-    tar = tarfile.open(filename, mode)
+    tar = tarfile.open(filename, mode, encoding="utf-8")
     try:
         leading = has_leading_dir([
             member.name for member in tar.getmembers()
@@ -184,7 +186,16 @@ def untar_file(filename, location):
             if leading:
                 # https://github.com/python/mypy/issues/1174
                 fn = split_leading_dir(fn)[1]  # type: ignore
-            path = os.path.join(location, fn)
+
+            path = os.path.join(location, fn)  # type: AnyStr
+            try:
+                # Convert path to text so all APIs below work with unicode files.
+                path = ensure_text(path)
+            except UnicodeDecodeError:
+                # Silently continue, for backwards compatibility on non-PAX
+                # archives.
+                pass
+
             if not is_within_directory(location, path):
                 message = (
                     'The tar file ({}) has a file ({}) trying to install '
diff --git a/tests/unit/test_utils_unpacking.py b/tests/unit/test_utils_unpacking.py
index af9ae1c0e71..c6d6f73142b 100644
--- a/tests/unit/test_utils_unpacking.py
+++ b/tests/unit/test_utils_unpacking.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 import os
 import shutil
 import stat
@@ -8,6 +9,7 @@
 import zipfile
 
 import pytest
+from pip._vendor.six import ensure_str, ensure_text
 
 from pip._internal.exceptions import InstallationError
 from pip._internal.utils.unpacking import (
@@ -172,6 +174,25 @@ def test_unpack_tar_success(self):
         untar_file(test_tar, self.tempdir)
 
 
+def test_unpack_tar_unicode(tmpdir):
+    test_tar = tmpdir / "test.tar"
+    # tarfile tries to decode incoming
+    with tarfile.open(
+        test_tar, "w", format=tarfile.PAX_FORMAT, encoding="utf-8"
+    ) as f:
+        metadata = tarfile.TarInfo(ensure_str(u"dir/åäö_日本語.py"))
+        f.addfile(metadata, "hello world")
+
+    output_dir = tmpdir / "output"
+    output_dir.mkdir()
+
+    untar_file(test_tar, str(output_dir))
+
+    output_dir_name = ensure_text(str(output_dir))
+    contents = os.listdir(output_dir_name)
+    assert u"åäö_日本語.py" in contents
+
+
 @pytest.mark.parametrize('args, expected', [
     # Test the second containing the first.
     (('parent/sub', 'parent/'), False),