diff --git a/dvc/ignore.py b/dvc/ignore.py index 0e80a02103..71569b38b3 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -6,12 +6,12 @@ from pathspec.patterns import GitWildMatchPattern from pathspec.util import normalize_file -from pygtrie import StringTrie from dvc.path_info import PathInfo from dvc.pathspec_math import PatternInfo, merge_patterns from dvc.system import System from dvc.utils import relpath +from dvc.utils.collections import PathStringTrie logger = logging.getLogger(__name__) @@ -156,7 +156,7 @@ class DvcIgnoreFilterNoop: def __init__(self, tree, root_dir): pass - def __call__(self, root, dirs, files): + def __call__(self, root, dirs, files, **kwargs): return dirs, files def is_ignored_dir(self, _): @@ -183,10 +183,11 @@ def __init__(self, tree, root_dir): self.tree = tree self.root_dir = root_dir - self.ignores_trie_tree = StringTrie(separator=os.sep) + self.ignores_trie_tree = PathStringTrie() self.ignores_trie_tree[root_dir] = DvcIgnorePatterns( default_ignore_patterns, root_dir ) + self._ignored_subrepos = PathStringTrie() self._update(self.root_dir) def _update(self, dirname): @@ -222,7 +223,10 @@ def _update(self, dirname): def _update_sub_repo(self, root, dirs): for d in dirs: if self._is_dvc_repo(root, d): - new_pattern = DvcIgnorePatterns(["/{}/".format(d)], root) + self._ignored_subrepos[root] = self._ignored_subrepos.get( + root, set() + ) | {d} + new_pattern = DvcIgnorePatterns([f"/{d}/"], root) old_pattern = self.ignores_trie_tree.longest_prefix(root).value if old_pattern: self.ignores_trie_tree[root] = DvcIgnorePatterns( @@ -236,12 +240,13 @@ def _update_sub_repo(self, root, dirs): else: self.ignores_trie_tree[root] = new_pattern - def __call__(self, root, dirs, files): + def __call__(self, root, dirs, files, ignore_subrepos=True): ignore_pattern = self._get_trie_pattern(root) if ignore_pattern: - return ignore_pattern(root, dirs, files) - else: - return dirs, files + dirs, files = ignore_pattern(root, dirs, files) + if not ignore_subrepos: + dirs.extend(self._ignored_subrepos.get(root, [])) + return dirs, files def _get_trie_pattern(self, dirname): ignore_pattern = self.ignores_trie_tree.get(dirname) @@ -277,8 +282,14 @@ def _is_ignored(self, path, is_dir=False): else: return False - def is_ignored_dir(self, path): + def _is_subrepo(self, path): + dirname, basename = os.path.split(os.path.normpath(path)) + return basename in self._ignored_subrepos.get(dirname, set()) + + def is_ignored_dir(self, path, ignore_subrepos=True): path = os.path.abspath(path) + if not ignore_subrepos: + return not self._is_subrepo(path) if path == self.root_dir: return False diff --git a/dvc/tree/git.py b/dvc/tree/git.py index edf49964a7..e4f2548982 100644 --- a/dvc/tree/git.py +++ b/dvc/tree/git.py @@ -84,13 +84,15 @@ def exists( path ) and not self.dvcignore.is_ignored_dir(path) - def isdir(self, path): # pylint: disable=arguments-differ + def isdir( + self, path, use_dvcignore=True + ): # pylint: disable=arguments-differ obj = self._git_object_by_path(path) if obj is None: return False if obj.mode != GIT_MODE_DIR: return False - return not self.dvcignore.is_ignored_dir(path) + return not (use_dvcignore and self.dvcignore.is_ignored_dir(path)) def isfile(self, path): # pylint: disable=arguments-differ obj = self._git_object_by_path(path) @@ -156,7 +158,14 @@ def _walk(self, tree, topdown=True): if not topdown: yield os.path.normpath(tree.abspath), dirs, nondirs - def walk(self, top, topdown=True, onerror=None, use_dvcignore=True): + def walk( + self, + top, + topdown=True, + onerror=None, + use_dvcignore=True, + ignore_subrepos=True, + ): """Directory tree generator. See `os.walk` for the docs. Differences: @@ -176,7 +185,10 @@ def walk(self, top, topdown=True, onerror=None, use_dvcignore=True): for root, dirs, files in self._walk(tree, topdown=topdown): if use_dvcignore: dirs[:], files[:] = self.dvcignore( - os.path.abspath(root), dirs, files + os.path.abspath(root), + dirs, + files, + ignore_subrepos=ignore_subrepos, ) yield root, dirs, files diff --git a/dvc/tree/local.py b/dvc/tree/local.py index bfce3345f8..ca3b8cfbfe 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -86,18 +86,26 @@ def isfile(self, path_info): return not self.dvcignore.is_ignored_file(path_info) - def isdir(self, path_info): + def isdir( + self, path_info, use_dvcignore=True + ): # pylint: disable=arguments-differ if not os.path.isdir(path_info): return False - - return not self.dvcignore.is_ignored_dir(path_info) + return not (use_dvcignore and self.dvcignore.is_ignored_dir(path_info)) def iscopy(self, path_info): return not ( System.is_symlink(path_info) or System.is_hardlink(path_info) ) - def walk(self, top, topdown=True, onerror=None, use_dvcignore=True): + def walk( + self, + top, + topdown=True, + onerror=None, + use_dvcignore=True, + ignore_subrepos=True, + ): """Directory tree generator. See `os.walk` for the docs. Differences: @@ -108,7 +116,10 @@ def walk(self, top, topdown=True, onerror=None, use_dvcignore=True): ): if use_dvcignore: dirs[:], files[:] = self.dvcignore( - os.path.abspath(root), dirs, files + os.path.abspath(root), + dirs, + files, + ignore_subrepos=ignore_subrepos, ) yield os.path.normpath(root), dirs, files diff --git a/dvc/utils/collections.py b/dvc/utils/collections.py index 1ff72a3472..d59869b616 100644 --- a/dvc/utils/collections.py +++ b/dvc/utils/collections.py @@ -1,5 +1,16 @@ +import os from collections.abc import Mapping +from pygtrie import StringTrie as _StringTrie + + +class PathStringTrie(_StringTrie): + """Trie based on platform-dependent separator for pathname components.""" + + def __init__(self, *args, **kwargs): + kwargs["separator"] = os.sep + super().__init__(*args, **kwargs) + def apply_diff(src, dest): """Recursively apply changes from src to dest. diff --git a/tests/dir_helpers.py b/tests/dir_helpers.py index 02ae154fd6..6d1abf8137 100644 --- a/tests/dir_helpers.py +++ b/tests/dir_helpers.py @@ -74,6 +74,8 @@ class TmpDir(pathlib.Path): + scheme = "local" + def __new__(cls, *args, **kwargs): if cls is TmpDir: cls = ( # pylint: disable=self-cls-assignment @@ -87,7 +89,7 @@ def __new__(cls, *args, **kwargs): self._init() return self - def init(self, *, scm=False, dvc=False): + def init(self, *, scm=False, dvc=False, subdir=False): from dvc.repo import Repo from dvc.scm.git import Git @@ -100,7 +102,9 @@ def init(self, *, scm=False, dvc=False): git_init(str_path) if dvc: self.dvc = Repo.init( - str_path, no_scm=not scm and not hasattr(self, "scm") + str_path, + no_scm=not scm and not hasattr(self, "scm"), + subdir=subdir, ) if scm: self.scm = self.dvc.scm if hasattr(self, "dvc") else Git(str_path) @@ -123,10 +127,10 @@ def gen(self, struct, text=""): if isinstance(struct, (str, bytes, pathlib.PurePath)): struct = {struct: text} - self._gen(struct) - return struct.keys() + return self._gen(struct) def _gen(self, struct, prefix=None): + paths = [] for name, contents in struct.items(): path = (prefix or self) / name @@ -141,6 +145,8 @@ def _gen(self, struct, prefix=None): path.write_bytes(contents) else: path.write_text(contents, encoding="utf-8") + paths.append(path) + return paths def dvc_gen(self, struct, text="", commit=None): paths = self.gen(struct, text) @@ -249,10 +255,10 @@ class PosixTmpDir(TmpDir, pathlib.PurePosixPath): @pytest.fixture(scope="session") def make_tmp_dir(tmp_path_factory, request): - def make(name, *, scm=False, dvc=False): + def make(name, *, scm=False, dvc=False, **kwargs): path = tmp_path_factory.mktemp(name) if isinstance(name, str) else name new_dir = TmpDir(path) - new_dir.init(scm=scm, dvc=dvc) + new_dir.init(scm=scm, dvc=dvc, **kwargs) request.addfinalizer(new_dir.close) return new_dir diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 67e8f71a5a..042329f79c 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -182,6 +182,28 @@ def test_ignore_subrepo(tmp_dir, scm, dvc): assert subrepo.tree.exists(PathInfo(subrepo_dir / "foo")) +def test_ignore_resurface_subrepo(tmp_dir, scm, dvc): + tmp_dir.dvc_gen({"foo": "foo"}, commit="add foo") + subrepo_dir = tmp_dir / "subdir" + subrepo_dir.mkdir() + with subrepo_dir.chdir(): + Repo.init(subdir=True) + + dvc.tree.__dict__.pop("dvcignore", None) + + dirs = ["subdir"] + files = ["foo"] + assert dvc.tree.dvcignore(os.fspath(tmp_dir), dirs, files) == ([], files) + assert dvc.tree.dvcignore( + os.fspath(tmp_dir), dirs, files, ignore_subrepos=False + ) == (dirs, files) + + assert dvc.tree.dvcignore.is_ignored_dir(os.fspath(subrepo_dir)) + assert not dvc.tree.dvcignore.is_ignored_dir( + os.fspath(subrepo_dir), ignore_subrepos=False + ) + + def test_ignore_blank_line(tmp_dir, dvc): tmp_dir.gen({"dir": {"ignored": "text", "other": "text2"}}) tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "foo\n\ndir/ignored") diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index 3ab93efc9b..ad09e5312f 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -1,4 +1,5 @@ import os +from operator import itemgetter from os.path import join from dvc.path_info import PathInfo @@ -245,3 +246,25 @@ def test_cleantree_subrepo(tmp_dir, dvc, scm, monkeypatch): assert subrepo.tree.isfile(path / "foo") assert subrepo.tree.exists(path / "dir") assert subrepo.tree.isdir(path / "dir") + + +def test_walk_dont_ignore_subrepos(tmp_dir, scm, dvc): + tmp_dir.dvc_gen({"foo": "foo"}, commit="add foo") + subrepo_dir = tmp_dir / "subdir" + subrepo_dir.mkdir() + with subrepo_dir.chdir(): + Repo.init(subdir=True) + scm.add(["subdir"]) + scm.commit("Add subrepo") + + dvc_tree = dvc.tree + scm_tree = scm.get_tree("HEAD", use_dvcignore=True) + path = os.fspath(tmp_dir) + get_dirs = itemgetter(1) + + assert get_dirs(next(dvc_tree.walk(path))) == [] + assert get_dirs(next(scm_tree.walk(path))) == [] + + kw = dict(ignore_subrepos=False) + assert get_dirs(next(dvc_tree.walk(path, **kw))) == ["subdir"] + assert get_dirs(next(scm_tree.walk(path, **kw))) == ["subdir"]