From 3210cfdd1dab3e62b79df56f78317d3fad6405ec Mon Sep 17 00:00:00 2001 From: John Sirois Date: Sun, 3 Nov 2024 12:27:18 -0800 Subject: [PATCH] Tighten up user code handling. Now only user code is extracted, and only if there is any. --- pex/hashing.py | 7 +-- pex/layout.py | 50 +++++++++++++------ pex/pex_builder.py | 6 ++- pex/pip/vcs.py | 6 ++- pex/util.py | 22 ++++++-- .../cli/commands/test_cache_prune.py | 4 +- 6 files changed, 69 insertions(+), 26 deletions(-) diff --git a/pex/hashing.py b/pex/hashing.py index 74712878d..784d62176 100644 --- a/pex/hashing.py +++ b/pex/hashing.py @@ -200,10 +200,11 @@ def dir_hash( def iter_files(): # type: () -> Iterator[Text] for root, dirs, files in os.walk(top, followlinks=True): - dirs[:] = [d for d in dirs if dir_filter(d)] + dirs[:] = [d for d in dirs if dir_filter(os.path.join(root, d))] for f in files: - if file_filter(f): - yield os.path.join(root, f) + path = os.path.join(root, f) + if file_filter(path): + yield path file_paths = sorted(iter_files()) diff --git a/pex/layout.py b/pex/layout.py index b080eee5d..514e0e205 100644 --- a/pex/layout.py +++ b/pex/layout.py @@ -147,6 +147,11 @@ def extract_main(self, dest_dir): # type: (str) -> None raise NotImplementedError() + @abstractmethod + def extract_import_hook(self, dest_dir): + # type: (str) -> None + raise NotImplementedError() + def record(self, dest_dir): # type: (str) -> None self._layout.record(dest_dir) @@ -334,11 +339,6 @@ def _ensure_installed( bootstrap_cache = BootstrapDir.create( pex_info.bootstrap_hash, pex_root=pex_info.pex_root ) - if pex_info.code_hash is None: - raise AssertionError( - "Expected code_hash to be populated for {}.".format(layout) - ) - code_cache = UserCodeDir.create(pex_info.code_hash, pex_root=pex_info.pex_root) with atomic_directory( bootstrap_cache, source=layout.bootstrap_strip_prefix() @@ -357,14 +357,18 @@ def _ensure_installed( install_to=install_to, ) - with atomic_directory(code_cache) as code_chroot: - if not code_chroot.is_finalized(): - layout.extract_code(code_chroot.work_dir) - for path in os.listdir(code_cache): - os.symlink( - os.path.join(os.path.relpath(code_cache, install_to), path), - os.path.join(chroot.work_dir, path), + if pex_info.code_hash: + code_cache = UserCodeDir.create( + pex_info.code_hash, pex_root=pex_info.pex_root ) + with atomic_directory(code_cache) as code_chroot: + if not code_chroot.is_finalized(): + layout.extract_code(code_chroot.work_dir) + for path in os.listdir(code_cache): + os.symlink( + os.path.join(os.path.relpath(code_cache, install_to), path), + os.path.join(chroot.work_dir, path), + ) layout.extract_pex_info(chroot.work_dir) layout.extract_main(chroot.work_dir) @@ -485,7 +489,7 @@ def extract_code(self, dest_dir): # type: (str) -> None for name in self.names: if name not in ("__main__.py", PEX_INFO_PATH) and not name.startswith( - (BOOTSTRAP_DIR, DEPS_DIR) + ("__pex__", BOOTSTRAP_DIR, DEPS_DIR) ): self.zfp.extract(name, dest_dir) @@ -497,6 +501,10 @@ def extract_main(self, dest_dir): # type: (str) -> None self.zfp.extract("__main__.py", dest_dir) + def extract_import_hook(self, dest_dir): + # type: (str) -> None + self.zfp.extract("__pex__/__init__.py", dest_dir) + def __str__(self): return "PEX zipfile {}".format(self.path) @@ -546,7 +554,7 @@ def extract_code(self, dest_dir): for root, dirs, files in os.walk(self._path): rel_root = os.path.relpath(root, self._path) if root == self._path: - dirs[:] = [d for d in dirs if d != DEPS_DIR] + dirs[:] = [d for d in dirs if d not in ("__pex__", DEPS_DIR)] files[:] = [ f for f in files if f not in ("__main__.py", PEX_INFO_PATH, BOOTSTRAP_DIR) ] @@ -566,6 +574,12 @@ def extract_main(self, dest_dir): # type: (str) -> None safe_copy(os.path.join(self._path, "__main__.py"), os.path.join(dest_dir, "__main__.py")) + def extract_import_hook(self, dest_dir): + # type: (str) -> None + dest = os.path.join(dest_dir, "__pex__", "__init__.py") + safe_mkdir(os.path.dirname(dest)) + safe_copy(os.path.join(self._path, "__pex__", "__init__.py"), dest) + def __str__(self): return "Spread PEX directory {}".format(self._path) @@ -617,7 +631,7 @@ def extract_code(self, dest_dir): for root, dirs, files in os.walk(self._path): rel_root = os.path.relpath(root, self._path) if root == self._path: - dirs[:] = [d for d in dirs if d not in (DEPS_DIR, BOOTSTRAP_DIR)] + dirs[:] = [d for d in dirs if d not in ("__pex__", DEPS_DIR, BOOTSTRAP_DIR)] files[:] = [f for f in files if f not in ("__main__.py", PEX_INFO_PATH)] for d in dirs: safe_mkdir(os.path.join(dest_dir, rel_root, d)) @@ -635,6 +649,12 @@ def extract_main(self, dest_dir): # type: (str) -> None safe_copy(os.path.join(self._path, "__main__.py"), os.path.join(dest_dir, "__main__.py")) + def extract_import_hook(self, dest_dir): + # type: (str) -> None + dest = os.path.join(dest_dir, "__pex__", "__init__.py") + safe_mkdir(os.path.dirname(dest)) + safe_copy(os.path.join(self._path, "__pex__", "__init__.py"), dest) + def __str__(self): return "Loose PEX directory {}".format(self._path) diff --git a/pex/pex_builder.py b/pex/pex_builder.py index 7bb6efcea..ceddf893b 100644 --- a/pex/pex_builder.py +++ b/pex/pex_builder.py @@ -461,8 +461,12 @@ def _precompile_source(self): self._chroot.touch(compiled, label="bytecode") def _prepare_code(self): + chroot_path = self._chroot.path() self._pex_info.code_hash = CacheHelper.pex_code_hash( - self._chroot.path(), exclude_dirs=(layout.BOOTSTRAP_DIR, layout.DEPS_DIR) + chroot_path, + exclude_dirs=tuple( + os.path.join(chroot_path, d) for d in (layout.BOOTSTRAP_DIR, layout.DEPS_DIR) + ), ) self._pex_info.pex_hash = hashlib.sha1(self._pex_info.dump().encode("utf-8")).hexdigest() self._chroot.write(self._pex_info.dump().encode("utf-8"), PexInfo.PATH, label="manifest") diff --git a/pex/pip/vcs.py b/pex/pip/vcs.py index b5a5260db..e954590f3 100644 --- a/pex/pip/vcs.py +++ b/pex/pip/vcs.py @@ -102,6 +102,10 @@ def digest_vcs_archive( hashing.dir_hash( directory=chroot, digest=digest, - dir_filter=lambda dir_path: not is_pyc_dir(dir_path) and dir_path != vcs_control_dir, + dir_filter=( + lambda dir_path: ( + not is_pyc_dir(dir_path) and os.path.basename(dir_path) != vcs_control_dir + ) + ), file_filter=lambda f: not is_pyc_file(f), ) diff --git a/pex/util.py b/pex/util.py index d10c4476c..94d51ea9f 100644 --- a/pex/util.py +++ b/pex/util.py @@ -70,6 +70,8 @@ def access_zipped_assets(cls, static_module_name, static_path, dir_location=None class CacheHelper(object): + _EMPTY_CODE_HASH = hashlib.sha1().hexdigest() + @classmethod def hash(cls, path, digest=None, hasher=sha1): # type: (Text, Optional[Hasher], Callable[[], Hasher]) -> str @@ -84,18 +86,28 @@ def pex_code_hash( cls, directory, exclude_dirs=(), # type: Container[str] + exclude_files=(), # type: Container[str] ): - # type: (...) -> str - """Return a reproducible hash of the contents of a loose PEX; excluding all `.pyc` files.""" + # type: (...) -> Optional[str] + """Return a reproducible hash of the user code of a loose PEX; excluding all `.pyc` files. + + If no code is found, `None` is returned. + """ digest = hashlib.sha1() hashing.dir_hash( directory=directory, digest=digest, dir_filter=lambda d: not is_pyc_dir(d) and d not in exclude_dirs, - file_filter=lambda file_path: not is_pyc_file(file_path) - and not file_path.startswith("."), + file_filter=( + lambda f: ( + not is_pyc_file(f) + and not os.path.basename(f).startswith(".") + and f not in exclude_files + ) + ), ) - return digest.hexdigest() + code_hash = digest.hexdigest() + return None if code_hash == cls._EMPTY_CODE_HASH else code_hash @classmethod def dir_hash(cls, directory, digest=None, hasher=sha1): diff --git a/tests/integration/cli/commands/test_cache_prune.py b/tests/integration/cli/commands/test_cache_prune.py index 2af82b71d..cb7112093 100644 --- a/tests/integration/cli/commands/test_cache_prune.py +++ b/tests/integration/cli/commands/test_cache_prune.py @@ -106,7 +106,9 @@ def test_installed_wheel_prune_run_time( assert DiskUsage.collect(CacheDir.INSTALLED_WHEELS.path()).size > 0 assert DiskUsage.collect(CacheDir.UNZIPPED_PEXES.path()).size > 0 assert DiskUsage.collect(CacheDir.BOOTSTRAPS.path()).size > 0 - assert DiskUsage.collect(CacheDir.USER_CODE.path()).size > 0 + assert ( + 0 == DiskUsage.collect(CacheDir.USER_CODE.path()).size + ), "There is no user code in the PEX." assert ( pre_prune_du.size > pex_size ), "Expected the unzipped PEX to be larger than the zipped pex."