diff --git a/conda_build/inspect_pkg.py b/conda_build/inspect_pkg.py index f8fa57a1b5..87bc372766 100644 --- a/conda_build/inspect_pkg.py +++ b/conda_build/inspect_pkg.py @@ -43,13 +43,13 @@ ) from .deprecations import deprecated -from .utils import on_mac, on_win, samefile +from .utils import on_mac, on_win @deprecated("3.28.0", "24.1.0") @lru_cache(maxsize=None) def dist_files(prefix: str | os.PathLike | Path, dist: Dist) -> set[str]: - if (prec := PrefixData(prefix).get(dist.name, None)) is None: + if (prec := PrefixData(str(prefix)).get(dist.name, None)) is None: return set() elif MatchSpec(dist).match(prec): return set(prec["files"]) @@ -62,19 +62,48 @@ def which_package( path: str | os.PathLike | Path, prefix: str | os.PathLike | Path, ) -> Iterable[PrefixRecord]: - """ + """Detect which package(s) a path belongs to. + Given the path (of a (presumably) conda installed file) iterate over the conda packages the file came from. Usually the iteration yields only one package. + + We use lstat since a symlink doesn't clobber the file it points to. """ prefix = Path(prefix) - # historically, path was relative to prefix just to be safe we append to prefix - # (pathlib correctly handles this even if path is absolute) - path = prefix / path + # historically, path was relative to prefix, just to be safe we append to prefix + # get lstat before calling _file_package_mapping in case path doesn't exist + try: + lstat = (prefix / path).lstat() + except FileNotFoundError: + # FileNotFoundError: path doesn't exist + return + else: + yield from _file_package_mapping(prefix).get(lstat, ()) + + +@lru_cache(maxsize=None) +def _file_package_mapping(prefix: Path) -> dict[os.stat_result, set[PrefixRecord]]: + """Map paths to package records. + + We use lstat since a symlink doesn't clobber the file it points to. + """ + mapping: dict[os.stat_result, set[PrefixRecord]] = {} for prec in PrefixData(str(prefix)).iter_records(): - if any(samefile(prefix / file, path) for file in prec["files"]): - yield prec + for file in prec["files"]: + # packages are capable of removing files installed by other dependencies from + # the build prefix, in those cases lstat will fail, while which_package wont + # return the correct package(s) in such a condition we choose to not worry about + # it since this file to package lookup exists primarily to detect clobbering + try: + lstat = (prefix / file).lstat() + except FileNotFoundError: + # FileNotFoundError: path doesn't exist + continue + else: + mapping.setdefault(lstat, set()).add(prec) + return mapping def print_object_info(info, key): diff --git a/conda_build/utils.py b/conda_build/utils.py index 6c300737c1..3961e38bd6 100644 --- a/conda_build/utils.py +++ b/conda_build/utils.py @@ -2222,6 +2222,7 @@ def is_conda_pkg(pkg_path: str) -> bool: ) +@deprecated("3.28.3", "24.1.0") def samefile(path1: Path, path2: Path) -> bool: try: return path1.samefile(path2) diff --git a/news/5126-samefile-regression b/news/5126-samefile-regression new file mode 100644 index 0000000000..e3ec790882 --- /dev/null +++ b/news/5126-samefile-regression @@ -0,0 +1,19 @@ +### Enhancements + +* + +### Bug fixes + +* Update `which_package` to use a cached mapping of files to packages (`O(1)`) instead of relying on `Path.samefile` comparisons (`O(n * m)`). (#5126 via #5130) + +### Deprecations + +* + +### Docs + +* + +### Other + +* diff --git a/pyproject.toml b/pyproject.toml index a3477043d0..825ae5627f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -129,4 +129,5 @@ markers = [ "slow: execute the slow tests if active", "sanity: execute the sanity tests", "no_default_testing_config: used internally to disable monkeypatching for testing_config", + "benchmark: execute the benchmark tests", ] diff --git a/tests/test_config.py b/tests/test_config.py index 7c46ca0693..fa362a0b4f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -7,7 +7,7 @@ import pytest from conda_build.config import Config, get_or_merge_config -from conda_build.utils import on_win, samefile +from conda_build.utils import on_win @pytest.fixture @@ -39,20 +39,25 @@ def test_keep_old_work(config: Config, build_id: str, tmp_path: Path): config.croot = tmp_path config.build_id = build_id + magic = "a_touched_file.magic" + # empty working directory orig_dir = Path(config.work_dir) + assert orig_dir.exists() assert not len(os.listdir(config.work_dir)) # touch a file so working directory is not empty - (orig_dir / "a_touched_file.magic").touch() - assert len(os.listdir(config.work_dir)) + (orig_dir / magic).touch() + assert orig_dir.exists() + assert len(os.listdir(config.work_dir)) == 1 + assert Path(config.work_dir, magic).exists() config.compute_build_id("a_new_name", reset=True) - # working directory should still exist and have the touched file - assert not samefile(orig_dir, config.work_dir) + # working directory should still exist (in new location) and have the touched file assert not orig_dir.exists() - assert len(os.listdir(config.work_dir)) + assert len(os.listdir(config.work_dir)) == 1 + assert Path(config.work_dir, magic).exists() @pytest.mark.skipif(on_win, reason="Windows uses only the short prefix") diff --git a/tests/test_inspect_pkg.py b/tests/test_inspect_pkg.py index edefa96a54..97aa9228db 100644 --- a/tests/test_inspect_pkg.py +++ b/tests/test_inspect_pkg.py @@ -3,8 +3,11 @@ from __future__ import annotations import json +import os from pathlib import Path +from uuid import uuid4 +import pytest from conda.core.prefix_data import PrefixData from conda_build.inspect_pkg import which_package @@ -100,7 +103,7 @@ def test_which_package(tmp_path: Path): precs_hardlinkA = list(which_package(tmp_path / "hardlinkA", tmp_path)) assert len(precs_hardlinkA) == 1 - assert precs_hardlinkA[0] == precA + assert set(precs_hardlinkA) == {precA} precs_shared = list(which_package(tmp_path / "shared", tmp_path)) assert len(precs_shared) == 2 @@ -108,12 +111,71 @@ def test_which_package(tmp_path: Path): precs_internal = list(which_package(tmp_path / "internal", tmp_path)) assert len(precs_internal) == 1 - assert precs_internal[0] == precA + assert set(precs_internal) == {precA} precs_external = list(which_package(tmp_path / "external", tmp_path)) - assert len(precs_external) == 2 - assert set(precs_external) == {precA, precB} + assert len(precs_external) == 1 + assert set(precs_external) == {precA} precs_hardlinkB = list(which_package(tmp_path / "hardlinkB", tmp_path)) - assert len(precs_hardlinkB) == 2 - assert set(precs_hardlinkB) == {precA, precB} + assert len(precs_hardlinkB) == 1 + assert set(precs_hardlinkB) == {precB} + + +@pytest.mark.benchmark +def test_which_package_battery(tmp_path: Path): + # regression: https://github.com/conda/conda-build/issues/5126 + # create a dummy environment + (tmp_path / "conda-meta").mkdir() + (tmp_path / "conda-meta" / "history").touch() + (tmp_path / "lib").mkdir() + + # dummy packages with files + removed = [] + for _ in range(100): + name = f"package_{uuid4().hex}" + + # mock a package with 100 files + files = [f"lib/{uuid4().hex}" for _ in range(100)] + for file in files: + (tmp_path / file).touch() + + # mock a removed file + remove = f"lib/{uuid4().hex}" + files.append(remove) + removed.append(remove) + + (tmp_path / "conda-meta" / f"{name}-1-0.json").write_text( + json.dumps( + { + "build": "0", + "build_number": 0, + "channel": f"{name}-channel", + "files": files, + "name": name, + "paths_data": { + "paths": [ + {"_path": file, "path_type": "hardlink", "size_in_bytes": 0} + for file in files + ], + "paths_version": 1, + }, + "version": "1", + } + ) + ) + + # every path should return exactly one package + for subdir, _, files in os.walk(tmp_path / "lib"): + for file in files: + path = Path(subdir, file) + + assert len(list(which_package(path, tmp_path))) == 1 + + # removed files should return no packages + # this occurs when, e.g., a package removes files installed by another package + for file in removed: + assert not len(list(which_package(tmp_path / file, tmp_path))) + + # missing files should return no packages + assert not len(list(which_package(tmp_path / "missing", tmp_path)))