diff --git a/dvc/ignore.py b/dvc/ignore.py
index 946e5077e9..2ab78ea00a 100644
--- a/dvc/ignore.py
+++ b/dvc/ignore.py
@@ -6,8 +6,10 @@
 from funcy import cached_property
 from pathspec.patterns import GitWildMatchPattern
 from pathspec.util import normalize_file
+from pygtrie import StringTrie
 
 from dvc.path_info import PathInfo
+from dvc.pathspec_math import merge_patterns
 from dvc.scm.tree import BaseTree
 from dvc.system import System
 from dvc.utils import relpath
@@ -23,25 +25,34 @@ def __call__(self, root, dirs, files):
 
 
 class DvcIgnorePatterns(DvcIgnore):
-    def __init__(self, ignore_file_path, tree):
-        assert os.path.isabs(ignore_file_path)
+    def __init__(self, pattern_list, dirname):
+        """Compile `pattern_list`, given relative to `dirname`."""
+        self.pattern_list = pattern_list
+        self.dirname = dirname
+        self.prefix = self.dirname + os.sep
 
-        self.ignore_file_path = ignore_file_path
-        self.dirname = os.path.normpath(os.path.dirname(ignore_file_path))
+        regex_pattern_list = map(
+            GitWildMatchPattern.pattern_to_regex, pattern_list
+        )
+        self.ignore_spec = [
+            (ignore, re.compile("|".join(item[0] for item in group)))
+            for ignore, group in groupby(regex_pattern_list, lambda x: x[1])
+            if ignore is not None
+        ]
+
+    @classmethod
+    def from_files(cls, ignore_file_path, tree):
+        """Load the patterns of the dvcignore file at `ignore_file_path`."""
+        assert os.path.isabs(ignore_file_path)
+        dirname = os.path.normpath(os.path.dirname(ignore_file_path))
 
         with tree.open(ignore_file_path, encoding="utf-8") as fobj:
-            path_spec_lines = fobj.readlines()
-            regex_pattern_list = map(
-                GitWildMatchPattern.pattern_to_regex, path_spec_lines
-            )
-            self.ignore_spec = [
-                (ignore, re.compile("|".join(item[0] for item in group)))
-                for ignore, group in groupby(
-                    regex_pattern_list, lambda x: x[1]
-                )
-                if ignore is not None
+            path_spec_lines = [
+                line for line in map(str.strip, fobj.readlines()) if line
             ]
 
+        return cls(path_spec_lines, dirname)
+
     def __call__(self, root, dirs, files):
         files = [f for f in files if not self.matches(root, f)]
         dirs = [d for d in dirs if not self.matches(root, d, True)]
@@ -51,11 +62,10 @@ def __call__(self, root, dirs, files):
     def matches(self, dirname, basename, is_dir=False):
         # NOTE: `relpath` is too slow, so we have to assume that both
         # `dirname` and `self.dirname` are relative or absolute together.
-        prefix = self.dirname + os.sep
         if dirname == self.dirname:
             path = basename
-        elif dirname.startswith(prefix):
-            rel = dirname[len(prefix) :]
+        elif dirname.startswith(self.prefix):
+            rel = dirname[len(self.prefix) :]
             # NOTE: `os.path.join` is ~x5.5 slower
             path = f"{rel}{os.sep}{basename}"
         else:
@@ -79,13 +89,51 @@ def ignore(self, path, is_dir):
         return result
 
     def __hash__(self):
-        return hash(self.ignore_file_path)
+        return hash(self.dirname + ":" + "\n".join(self.pattern_list))
 
     def __eq__(self, other):
         if not isinstance(other, DvcIgnorePatterns):
             return NotImplemented
+        if self.dirname != other.dirname:
+            return False
+        return self.pattern_list == other.pattern_list
+
+    def __bool__(self):
+        return bool(self.pattern_list)
 
-        return self.ignore_file_path == other.ignore_file_path
+
+class DvcIgnorePatternsTrie(DvcIgnore):
+    """Trie of merged `DvcIgnorePatterns`, keyed by directory path."""
+
+    trie = None
+
+    def __init__(self):
+        if self.trie is None:
+            self.trie = StringTrie(separator=os.sep)
+
+    def __call__(self, root, dirs, files):
+        ignore_pattern = self[root]
+        if ignore_pattern:
+            return ignore_pattern(root, dirs, files)
+        return dirs, files
+
+    def __setitem__(self, root, ignore_pattern):
+        """Merge `ignore_pattern` into the patterns inherited at `root`."""
+        base_pattern = self[root]
+        common_dirname, merged_pattern = merge_patterns(
+            base_pattern.dirname,
+            base_pattern.pattern_list,
+            ignore_pattern.dirname,
+            ignore_pattern.pattern_list,
+        )
+        self.trie[root] = DvcIgnorePatterns(merged_pattern, common_dirname)
+
+    def __getitem__(self, root):
+        """Return the patterns of the longest stored prefix of `root`."""
+        ignore_pattern = self.trie.longest_prefix(root)
+        if ignore_pattern:
+            return ignore_pattern.value
+        return DvcIgnorePatterns([], root)
 
 
 class DvcIgnoreDirs(DvcIgnore):
@@ -127,14 +175,20 @@ def __init__(self, tree, root_dir):
             DvcIgnoreDirs([".git", ".hg", ".dvc"]),
             DvcIgnoreRepo(),
         }
-        for root, dirs, files in self.tree.walk(self.root_dir):
-            self._update(root)
-            dirs[:], files[:] = self(root, dirs, files)
-
-    def _update(self, dirname):
+        ignore_pattern_trie = DvcIgnorePatternsTrie()
+        for root, dirs, _ in self.tree.walk(self.root_dir):
+            ignore_pattern = self._get_ignore_pattern(root)
+            if ignore_pattern:
+                ignore_pattern_trie[root] = ignore_pattern
+                self.ignores.add(ignore_pattern_trie)
+            dirs[:], _ = self(root, dirs, [])
+
+    def _get_ignore_pattern(self, dirname):
+        """Load `dirname`'s dvcignore file; return None if it has none."""
         ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
         if self.tree.exists(ignore_file_path):
-            self.ignores.add(DvcIgnorePatterns(ignore_file_path, self.tree))
+            return DvcIgnorePatterns.from_files(ignore_file_path, self.tree)
+        return None
 
     def __call__(self, root, dirs, files):
         for ignore in self.ignores:
diff --git a/dvc/pathspec_math.py b/dvc/pathspec_math.py
new file mode 100644
index 0000000000..7377fe4bc4
--- /dev/null
+++ b/dvc/pathspec_math.py
@@ -0,0 +1,107 @@
+# Path Specification Pattern Math
+# Including changing base dir of path specification patterns and merging
+# of two path specification patterns with different base
+# All the operations follow the documents of `gitignore`
+import os
+
+from pathspec.util import normalize_file
+
+
+def _not_ignore(rule):
+    """Split off a leading "!": return (is_negated, rule without it)."""
+    return (True, rule[1:]) if rule.startswith("!") else (False, rule)
+
+
+def _is_comment(rule):
+    """Return True if `rule` is a comment line (starts with "#")."""
+    return rule.startswith("#")
+
+
+def _remove_slash(rule):
+    """Drop one leading backslash, as used to escape "#", "!" or " "."""
+    if rule.startswith("\\"):
+        return rule[1:]
+    return rule
+
+
+def _match_all_level(rule):
+    """Return (match_all, rule) with a leading "/" or "**/" removed.
+
+    `match_all` is False when the rule has a separator at its beginning or
+    middle (and no leading "**/"), i.e. it is relative to its base directory
+    only; it is True when the rule may match at any level below that base.
+    """
+    if "/" in rule[:-1] and not rule.startswith("**/"):
+        if rule.startswith("/"):
+            rule = rule[1:]
+        return False, rule
+    if rule.startswith("**/"):
+        rule = rule[3:]
+    return True, rule
+
+
+def change_rule(rule, rel):
+    """Rewrite `rule` so that it applies from a parent base directory.
+
+    `rel` is the rule's original base directory, relative to the new base.
+    Comment lines are returned unchanged.
+    """
+    rule = rule.strip()
+    if _is_comment(rule):
+        return rule
+    not_ignore, rule = _not_ignore(rule)
+    match_all, rule = _match_all_level(rule)
+    rule = _remove_slash(rule)
+    if not match_all:
+        rule = f"/{rule}"
+    else:
+        rule = f"/**/{rule}"
+    if not_ignore:
+        rule = f"!/{rel}{rule}"
+    else:
+        rule = f"/{rel}{rule}"
+    rule = normalize_file(rule)
+    return rule
+
+
+def _change_dirname(dirname, pattern_list, new_dirname):
+    """Rebase `pattern_list` from `dirname` onto its ancestor `new_dirname`.
+
+    Raises ValueError if `new_dirname` is not `dirname` or one of its
+    parent directories.
+    """
+    if new_dirname == dirname:
+        return pattern_list
+    rel = os.path.relpath(dirname, new_dirname)
+    # Compare whole path components: a directory named e.g. "..cache" is a
+    # valid child even though its name starts with "..".
+    if rel == os.pardir or rel.startswith(os.pardir + os.sep):
+        raise ValueError("change dirname can only change to parent path")
+
+    return [change_rule(rule, rel) for rule in pattern_list]
+
+
+def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b):
+    """
+    Merge two path specification patterns.
+
+    This implementation merges two path specification patterns on different
+    bases. It returns the longest common parent directory, and the patterns
+    based on this new base directory.
+    """
+    if not pattern_a:
+        return prefix_b, pattern_b
+    if not pattern_b:
+        return prefix_a, pattern_a
+
+    longest_common_dir = os.path.commonpath([prefix_a, prefix_b])
+    new_pattern_a = _change_dirname(prefix_a, pattern_a, longest_common_dir)
+    new_pattern_b = _change_dirname(prefix_b, pattern_b, longest_common_dir)
+
+    # The patterns of the base with the shorter path go first.
+    if len(prefix_a) < len(prefix_b):
+        merged_pattern = new_pattern_a + new_pattern_b
+    else:
+        merged_pattern = new_pattern_b + new_pattern_a
+
+    return longest_common_dir, merged_pattern
diff --git a/setup.cfg b/setup.cfg
index f9dc5b216d..826c5f64fe 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,7 +17,7 @@ count=true
 [isort]
 include_trailing_comma=true
 known_first_party=dvc,tests
-known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,grandalf,mock,moto,nanotime,networkx,packaging,pathspec,pylint,pytest,requests,ruamel,setuptools,shortuuid,shtab,tqdm,voluptuous,yaml,zc
+known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,grandalf,mock,moto,nanotime,networkx,packaging,pathspec,pygtrie,pylint,pytest,requests,ruamel,setuptools,shortuuid,shtab,tqdm,voluptuous,yaml,zc
 line_length=79
 force_grid_wrap=0
 use_parentheses=True
diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py
index ee9abd9431..c9b9feea14 100644
--- a/tests/func/test_ignore.py
+++ b/tests/func/test_ignore.py
@@ -8,6 +8,7 @@
     DvcIgnore,
     DvcIgnoreDirs,
     DvcIgnorePatterns,
+    DvcIgnorePatternsTrie,
     DvcIgnoreRepo,
 )
 from dvc.repo import Repo
@@ -98,12 +99,19 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname):
 
     assert len(dvc.tree.dvcignore.ignores) == 3
     assert DvcIgnoreDirs([".git", ".hg", ".dvc"]) in dvc.tree.dvcignore.ignores
+    ignore_pattern_trie = None
+    for ignore in dvc.tree.dvcignore.ignores:
+        if isinstance(ignore, DvcIgnorePatternsTrie):
+            ignore_pattern_trie = ignore
+
+    assert ignore_pattern_trie is not None
     assert (
-        DvcIgnorePatterns(
+        DvcIgnorePatterns.from_files(
             os.fspath(top_ignore_file), WorkingTree(dvc.root_dir)
         )
-        in dvc.tree.dvcignore.ignores
+        == ignore_pattern_trie[os.fspath(ignore_file)]
     )
+
     assert any(
         i for i in dvc.tree.dvcignore.ignores if isinstance(i, DvcIgnoreRepo)
     )
@@ -236,3 +244,102 @@ def test_ignore_directory(tmp_dir, dvc):
     assert _files_set("dir", dvc.tree) == {
         "dir/{}".format(DvcIgnore.DVCIGNORE_FILE),
     }
+
+
+def test_multi_ignore_file(tmp_dir, dvc):
+    tmp_dir.gen({"dir": {"subdir": {"should_ignore": "1", "not_ignore": "1"}}})
+    tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "dir/subdir/*_ignore")
+    tmp_dir.gen({"dir": {DvcIgnore.DVCIGNORE_FILE: "!subdir/not_ignore"}})
+
+    assert _files_set("dir", dvc.tree) == {
+        "dir/subdir/not_ignore",
+        "dir/{}".format(DvcIgnore.DVCIGNORE_FILE),
+    }
+
+
+def test_pattern_trie_tree(tmp_dir, dvc):
+    tmp_dir.gen(
+        {
+            "top": {
+                "first": {
+                    DvcIgnore.DVCIGNORE_FILE: "a\nb\nc",
+                    "middle": {
+                        "second": {
+                            DvcIgnore.DVCIGNORE_FILE: "d\ne\nf",
+                            "bottom": {},
+                        }
+                    },
+                },
+            },
+            "other": {DvcIgnore.DVCIGNORE_FILE: "1\n2\n3"},
+        }
+    )
+    ignore_pattern_trie = None
+    for ignore in dvc.tree.dvcignore.ignores:
+        if isinstance(ignore, DvcIgnorePatternsTrie):
+            ignore_pattern_trie = ignore
+            break
+
+    assert ignore_pattern_trie is not None
+    ignore_pattern_top = ignore_pattern_trie[os.fspath(tmp_dir / "top")]
+    ignore_pattern_other = ignore_pattern_trie[os.fspath(tmp_dir / "other")]
+    ignore_pattern_first = ignore_pattern_trie[
+        os.fspath(tmp_dir / "top" / "first")
+    ]
+    ignore_pattern_middle = ignore_pattern_trie[
+        os.fspath(tmp_dir / "top" / "first" / "middle")
+    ]
+    ignore_pattern_second = ignore_pattern_trie[
+        os.fspath(tmp_dir / "top" / "first" / "middle" / "second")
+    ]
+    ignore_pattern_bottom = ignore_pattern_trie[
+        os.fspath(tmp_dir / "top" / "first" / "middle" / "second" / "bottom")
+    ]
+    assert not ignore_pattern_top
+    assert (
+        DvcIgnorePatterns([], os.fspath(tmp_dir / "top")) == ignore_pattern_top
+    )
+    assert (
+        DvcIgnorePatterns(["1", "2", "3"], os.fspath(tmp_dir / "other"))
+        == ignore_pattern_other
+    )
+    assert (
+        DvcIgnorePatterns(
+            ["a", "b", "c"], os.fspath(tmp_dir / "top" / "first")
+        )
+        == ignore_pattern_first
+    )
+    assert (
+        DvcIgnorePatterns(
+            ["a", "b", "c"], os.fspath(tmp_dir / "top" / "first")
+        )
+        == ignore_pattern_middle
+    )
+    assert (
+        DvcIgnorePatterns(
+            [
+                "a",
+                "b",
+                "c",
+                "/middle/second/**/d",
+                "/middle/second/**/e",
+                "/middle/second/**/f",
+            ],
+            os.fspath(tmp_dir / "top" / "first"),
+        )
+        == ignore_pattern_second
+    )
+    assert (
+        DvcIgnorePatterns(
+            [
+                "a",
+                "b",
+                "c",
+                "/middle/second/**/d",
+                "/middle/second/**/e",
+                "/middle/second/**/f",
+            ],
+            os.fspath(tmp_dir / "top" / "first"),
+        )
+        == ignore_pattern_bottom
+    )
diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py
index 85c46edee4..0dcde9c910 100644
--- a/tests/unit/test_ignore.py
+++ b/tests/unit/test_ignore.py
@@ -9,7 +9,7 @@
 def mock_dvcignore(dvcignore_path, patterns):
     tree = MagicMock()
     with patch.object(tree, "open", mock_open(read_data="\n".join(patterns))):
-        ignore_patterns = DvcIgnorePatterns(dvcignore_path, tree)
+        ignore_patterns = DvcIgnorePatterns.from_files(dvcignore_path, tree)
 
     return ignore_patterns
 
diff --git a/tests/unit/test_pathspec_math.py b/tests/unit/test_pathspec_math.py
new file mode 100644
index 0000000000..1cc6837153
--- /dev/null
+++ b/tests/unit/test_pathspec_math.py
@@ -0,0 +1,76 @@
+import pytest
+
+from dvc.pathspec_math import _change_dirname
+
+
+@pytest.mark.parametrize(
+    "patterns, dirname, changed",
+    [
+        # A line starting with # serves as a comment.
+        ("#comment", "/dir", "#comment"),
+        # Put a backslash ("\") in front of the first hash for patterns that
+        # begin with a hash.
+        ("\\#hash", "/dir", "/dir/**/#hash"),
+        ("\\#hash", "/#dir", "/#dir/**/#hash"),
+        # Trailing spaces are ignored unless they are quoted with
+        # backslash ("\").
+        (" space", "/dir", "/dir/**/space"),
+        ("\\ space", "/dir", "/dir/**/ space"),
+        # An optional prefix "!" which negates the pattern;
+        ("!include", "/dir", "!/dir/**/include"),
+        # Put a backslash ("\") in front of the first "!" for patterns that
+        # begin with a literal "!", for example, "\!important!.txt".
+        ("\\!important!.txt", "/dir", "/dir/**/!important!.txt"),
+        # If there is a separator at the beginning or middle (or both) of the
+        # pattern, then the pattern is relative to the directory level of the
+        # particular .gitignore file itself.
+        ("/separator.txt", "/dir", "/dir/separator.txt"),
+        ("subdir/separator.txt", "/dir", "/dir/subdir/separator.txt"),
+        # Otherwise the pattern may also match at any level below
+        # the .gitignore level.
+        ("no_sep", "/dir", "/dir/**/no_sep"),
+        # If there is a separator at the end of the pattern then the pattern
+        # will only match directories, otherwise the pattern can match both
+        # files and directories.
+        ("doc/fortz/", "/dir", "/dir/doc/fortz/"),
+        ("fortz/", "/dir", "/dir/**/fortz/"),
+        # An asterisk "*" matches anything except a slash.
+        ("*aste*risk*", "/dir", "/dir/**/*aste*risk*"),
+        # The character "?" matches any one character except "/".
+        ("?fi?le?", "/dir", "/dir/**/?fi?le?"),
+        # The range notation, e.g. [a-zA-Z], can be used to match one of the
+        # characters in a range. See fnmatch(3) and the FNM_PATHNAME flag
+        # for a more detailed description.
+        ("[a-zA-Z]file[a-zA-Z]", "/dir", "/dir/**/[a-zA-Z]file[a-zA-Z]"),
+        # Two consecutive asterisks ("**") in patterns matched against full
+        # pathname may have special meaning:
+        # A leading "**" followed by a slash means match in all directories.
+        # For example, "**/foo" matches file or directory "foo" anywhere,
+        # the same as pattern "foo".
+        ("**/foo", "/dir", "/dir/**/foo"),
+        # "**/foo/bar" matches file or directory "bar" anywhere that is
+        # directly under directory "foo".
+        ("**/foo/bar", "/dir", "/dir/**/foo/bar"),
+        # A trailing "/**" matches everything inside.
+        # For example, "abc/**" matches all files inside directory "abc",
+        # relative to the location of the .gitignore file, with infinite depth.
+        ("abc/**", "/dir", "/dir/abc/**"),
+        # A slash followed by two consecutive asterisks then a slash matches
+        # zero or more directories. For example, "a/**/b"
+        # matches "a/b", "a/x/b", "a/x/y/b" and so on.
+        ("a/**/b", "/dir", "/dir/a/**/b"),
+        # Other consecutive asterisks are considered regular asterisks and
+        # will match according to the previous rules.
+        ("/***.txt", "/dir", "/dir/***.txt"),
+        ("data/***", "/dir", "/dir/data/***"),
+        ("***/file.txt", "/dir", "/dir/***/file.txt"),
+        ("***file", "/dir", "/dir/**/***file"),
+        ("a/***/b", "/dir", "/dir/a/***/b"),
+    ],
+)
+def test_dvcignore_pattern_change_dir(tmp_dir, patterns, dirname, changed):
+    assert _change_dirname(dirname, [patterns], "/") == [changed]
+
+
+def test_change_dirname_dotdot_prefixed_child():
+    assert _change_dirname("/dir/..sub", ["a"], "/dir") == ["/..sub/**/a"]