Skip to content

Commit

Permalink
Set encoding for tar file and use unicode path for unpacking
Browse files Browse the repository at this point in the history
When tarfile.TarFile decodes filenames in Python 2.7, it uses
sys.getfilesystemencoding() by default. On Windows this returns "mbcs",
which is lossy when converting from proper utf-8 to bytes (out-of-range
characters are replaced with '?').

We now pass an encoding to tarfile.open which will be used instead.
Since the encoding argument is only ever used for the PAX format, and
since the PAX format guarantees utf-8 encoded information, this should
work in all circumstances.

For filesystem APIs in Python 2, the type of the path object passed
dictates the underlying Windows API that is called. For `str` it is the
`*A` (for ANSI) APIs. For `unicode` it is the `*W` (for Wide character)
APIs. To use the second set of APIs, which properly handle unicode
filenames, we try to decode the byte path as utf-8 into a `unicode`
object. Since there is no obvious way to identify a "PAX" tar file or tar
info entity, we optimistically attempt the conversion and silently
continue if it fails.
  • Loading branch information
chrahunt committed Jan 29, 2020
1 parent 153e22a commit 85f95d7
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 2 deletions.
1 change: 1 addition & 0 deletions news/7667.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix extraction of files with utf-8 encoded paths from tars.
15 changes: 13 additions & 2 deletions src/pip/_internal/utils/unpacking.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import tarfile
import zipfile

from pip._vendor.six import ensure_text

from pip._internal.exceptions import InstallationError
from pip._internal.utils.filetypes import (
BZ2_EXTENSIONS,
Expand Down Expand Up @@ -174,7 +176,7 @@ def untar_file(filename, location):
'Cannot determine compression type for file %s', filename,
)
mode = 'r:*'
tar = tarfile.open(filename, mode)
tar = tarfile.open(filename, mode, encoding="utf-8")
try:
leading = has_leading_dir([
member.name for member in tar.getmembers()
Expand All @@ -184,7 +186,16 @@ def untar_file(filename, location):
if leading:
# https://github.com/python/mypy/issues/1174
fn = split_leading_dir(fn)[1] # type: ignore
path = os.path.join(location, fn)

path = os.path.join(location, fn) # type: AnyStr
try:
# Convert path to text so all APIs below work with unicode files.
path = ensure_text(path)
except UnicodeDecodeError:
# Silently continue, for backwards compatibility on non-PAX
# archives.
pass

if not is_within_directory(location, path):
message = (
'The tar file ({}) has a file ({}) trying to install '
Expand Down
21 changes: 21 additions & 0 deletions tests/unit/test_utils_unpacking.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
import os
import shutil
import stat
Expand All @@ -8,6 +9,7 @@
import zipfile

import pytest
from pip._vendor.six import ensure_str, ensure_text

from pip._internal.exceptions import InstallationError
from pip._internal.utils.unpacking import (
Expand Down Expand Up @@ -172,6 +174,25 @@ def test_unpack_tar_success(self):
untar_file(test_tar, self.tempdir)


def test_unpack_tar_unicode(tmpdir):
    """Check that untar_file extracts a PAX member with a non-ASCII name.

    A PAX-format tar archive containing a single file whose path holds
    characters outside ASCII is written, unpacked with ``untar_file``, and
    the output directory is then listed using a text path to verify that the
    file name survived extraction intact.
    """
    # Local import so this test does not rely on a module-level ``io`` import.
    import io

    test_tar = tmpdir / "test.tar"
    payload = b"hello world"
    with tarfile.open(
        test_tar, "w", format=tarfile.PAX_FORMAT, encoding="utf-8"
    ) as f:
        # ensure_str hands TarInfo the member name as the native ``str`` type.
        metadata = tarfile.TarInfo(ensure_str(u"dir/åäö_日本語.py"))
        # addfile() copies ``metadata.size`` bytes from a binary file object;
        # without a size the payload would be silently dropped, and a plain
        # string is not a valid file object.
        metadata.size = len(payload)
        f.addfile(metadata, io.BytesIO(payload))

    output_dir = tmpdir / "output"
    output_dir.mkdir()

    untar_file(test_tar, str(output_dir))

    # List with a text path so the returned entries are text as well.
    output_dir_name = ensure_text(str(output_dir))
    contents = os.listdir(output_dir_name)
    assert u"åäö_日本語.py" in contents


@pytest.mark.parametrize('args, expected', [
# Test the second containing the first.
(('parent/sub', 'parent/'), False),
Expand Down

0 comments on commit 85f95d7

Please sign in to comment.