Skip to content

Commit

Permalink
Optimize ignore performance (#4120)
Browse files Browse the repository at this point in the history
* Remove Duplicate ignore Match

* Continue Optimize Dvcignore

fix #3869

* Add a new test with multi ignore files

* Solve merging two dvc files

* Solve Code Climate

* For Windows

* Complete addition of patterns. Add one test

* Systematic test

* Change request

* Change request

* Seperate path sepcification math

* Rename and add comment

* rename change_dirname to private

* Update dvc/pathspec_math.py

list comprehension

Co-authored-by: Alexander Schepanovski <[email protected]>

* Change request

* Update dvc/ignore.py

Co-authored-by: karajan1001 <[email protected]>
Co-authored-by: Alexander Schepanovski <[email protected]>
Co-authored-by: Ruslan Kuprieiev <[email protected]>
  • Loading branch information
4 people authored Jul 14, 2020
1 parent 66def03 commit 58f6534
Show file tree
Hide file tree
Showing 6 changed files with 341 additions and 29 deletions.
98 changes: 73 additions & 25 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
from funcy import cached_property
from pathspec.patterns import GitWildMatchPattern
from pathspec.util import normalize_file
from pygtrie import StringTrie

from dvc.path_info import PathInfo
from dvc.pathspec_math import merge_patterns
from dvc.scm.tree import BaseTree
from dvc.system import System
from dvc.utils import relpath
Expand All @@ -23,25 +25,33 @@ def __call__(self, root, dirs, files):


class DvcIgnorePatterns(DvcIgnore):
def __init__(self, ignore_file_path, tree):
assert os.path.isabs(ignore_file_path)
def __init__(self, pattern_list, dirname):

self.pattern_list = pattern_list
self.dirname = dirname
self.prefix = self.dirname + os.sep

self.ignore_file_path = ignore_file_path
self.dirname = os.path.normpath(os.path.dirname(ignore_file_path))
regex_pattern_list = map(
GitWildMatchPattern.pattern_to_regex, pattern_list
)

self.ignore_spec = [
(ignore, re.compile("|".join(item[0] for item in group)))
for ignore, group in groupby(regex_pattern_list, lambda x: x[1])
if ignore is not None
]

@classmethod
def from_files(cls, ignore_file_path, tree):
assert os.path.isabs(ignore_file_path)
dirname = os.path.normpath(os.path.dirname(ignore_file_path))
with tree.open(ignore_file_path, encoding="utf-8") as fobj:
path_spec_lines = fobj.readlines()
regex_pattern_list = map(
GitWildMatchPattern.pattern_to_regex, path_spec_lines
)
self.ignore_spec = [
(ignore, re.compile("|".join(item[0] for item in group)))
for ignore, group in groupby(
regex_pattern_list, lambda x: x[1]
)
if ignore is not None
path_spec_lines = [
line for line in map(str.strip, fobj.readlines()) if line
]

return cls(path_spec_lines, dirname)

def __call__(self, root, dirs, files):
files = [f for f in files if not self.matches(root, f)]
dirs = [d for d in dirs if not self.matches(root, d, True)]
Expand All @@ -51,11 +61,10 @@ def __call__(self, root, dirs, files):
def matches(self, dirname, basename, is_dir=False):
# NOTE: `relpath` is too slow, so we have to assume that both
# `dirname` and `self.dirname` are relative or absolute together.
prefix = self.dirname + os.sep
if dirname == self.dirname:
path = basename
elif dirname.startswith(prefix):
rel = dirname[len(prefix) :]
elif dirname.startswith(self.prefix):
rel = dirname[len(self.prefix) :]
# NOTE: `os.path.join` is ~x5.5 slower
path = f"{rel}{os.sep}{basename}"
else:
Expand All @@ -79,13 +88,47 @@ def ignore(self, path, is_dir):
return result

def __hash__(self):
return hash(self.ignore_file_path)
return hash(self.dirname + ":" + "\n".join(self.pattern_list))

def __eq__(self, other):
if not isinstance(other, DvcIgnorePatterns):
return NotImplemented
return (self.dirname == other.dirname) & (
self.pattern_list == other.pattern_list
)

def __bool__(self):
return bool(self.pattern_list)

return self.ignore_file_path == other.ignore_file_path

class DvcIgnorePatternsTrie(DvcIgnore):
trie = None

def __init__(self):
if self.trie is None:
self.trie = StringTrie(separator=os.sep)

def __call__(self, root, dirs, files):
ignore_pattern = self[root]
if ignore_pattern:
return ignore_pattern(root, dirs, files)
return dirs, files

def __setitem__(self, root, ignore_pattern):
base_pattern = self[root]
common_dirname, merged_pattern = merge_patterns(
base_pattern.dirname,
base_pattern.pattern_list,
ignore_pattern.dirname,
ignore_pattern.pattern_list,
)
self.trie[root] = DvcIgnorePatterns(merged_pattern, common_dirname)

def __getitem__(self, root):
ignore_pattern = self.trie.longest_prefix(root)
if ignore_pattern:
return ignore_pattern.value
return DvcIgnorePatterns([], root)


class DvcIgnoreDirs(DvcIgnore):
Expand Down Expand Up @@ -127,14 +170,19 @@ def __init__(self, tree, root_dir):
DvcIgnoreDirs([".git", ".hg", ".dvc"]),
DvcIgnoreRepo(),
}
for root, dirs, files in self.tree.walk(self.root_dir):
self._update(root)
dirs[:], files[:] = self(root, dirs, files)

def _update(self, dirname):
ignore_pattern_trie = DvcIgnorePatternsTrie()
for root, dirs, _ in self.tree.walk(self.root_dir):
ignore_pattern = self._get_ignore_pattern(root)
if ignore_pattern:
ignore_pattern_trie[root] = ignore_pattern
self.ignores.add(ignore_pattern_trie)
dirs[:], _ = self(root, dirs, [])

def _get_ignore_pattern(self, dirname):
ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
if self.tree.exists(ignore_file_path):
self.ignores.add(DvcIgnorePatterns(ignore_file_path, self.tree))
return DvcIgnorePatterns.from_files(ignore_file_path, self.tree)
return None

def __call__(self, root, dirs, files):
for ignore in self.ignores:
Expand Down
85 changes: 85 additions & 0 deletions dvc/pathspec_math.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Path Specification Pattern Math
# Including changing base dir of path specification patterns and merging
# of two path specification patterns with different base
# All the operations follow the documents of `gitignore`
import os

from pathspec.util import normalize_file


def _not_ignore(rule):
return (True, rule[1:]) if rule.startswith("!") else (False, rule)


def _is_comment(rule):
return rule.startswith("#")


def _remove_slash(rule):
if rule.startswith("\\"):
return rule[1:]
return rule


def _match_all_level(rule):
if rule[:-1].find("/") >= 0 and not rule.startswith("**/"):
if rule.startswith("/"):
rule = rule[1:]
return False, rule
if rule.startswith("**/"):
rule = rule[3:]
return True, rule


def change_rule(rule, rel):
rule = rule.strip()
if _is_comment(rule):
return rule
not_ignore, rule = _not_ignore(rule)
match_all, rule = _match_all_level(rule)
rule = _remove_slash(rule)
if not match_all:
rule = f"/{rule}"
else:
rule = f"/**/{rule}"
if not_ignore:
rule = f"!/{rel}{rule}"
else:
rule = f"/{rel}{rule}"
rule = normalize_file(rule)
return rule


def _change_dirname(dirname, pattern_list, new_dirname):
if new_dirname == dirname:
return pattern_list
rel = os.path.relpath(dirname, new_dirname)
if rel.startswith(".."):
raise ValueError("change dirname can only change to parent path")

return [change_rule(rule, rel) for rule in pattern_list]


def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b):
"""
Merge two path specification patterns.
This implementation merge two path specification patterns on different
bases. It returns the longest common parent directory, and the patterns
based on this new base directory.
"""
if not pattern_a:
return prefix_b, pattern_b
elif not pattern_b:
return prefix_a, pattern_a

longest_common_dir = os.path.commonpath([prefix_a, prefix_b])
new_pattern_a = _change_dirname(prefix_a, pattern_a, longest_common_dir)
new_pattern_b = _change_dirname(prefix_b, pattern_b, longest_common_dir)

if len(prefix_a) < len(prefix_b):
merged_pattern = new_pattern_a + new_pattern_b
else:
merged_pattern = new_pattern_b + new_pattern_a

return longest_common_dir, merged_pattern
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ count=true
[isort]
include_trailing_comma=true
known_first_party=dvc,tests
known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,grandalf,mock,moto,nanotime,networkx,packaging,pathspec,pylint,pytest,requests,ruamel,setuptools,shortuuid,shtab,tqdm,voluptuous,yaml,zc
known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,grandalf,mock,moto,nanotime,networkx,packaging,pathspec,pygtrie,pylint,pytest,requests,ruamel,setuptools,shortuuid,shtab,tqdm,voluptuous,yaml,zc
line_length=79
force_grid_wrap=0
use_parentheses=True
Expand Down
111 changes: 109 additions & 2 deletions tests/func/test_ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
DvcIgnore,
DvcIgnoreDirs,
DvcIgnorePatterns,
DvcIgnorePatternsTrie,
DvcIgnoreRepo,
)
from dvc.repo import Repo
Expand Down Expand Up @@ -98,12 +99,19 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname):

assert len(dvc.tree.dvcignore.ignores) == 3
assert DvcIgnoreDirs([".git", ".hg", ".dvc"]) in dvc.tree.dvcignore.ignores
ignore_pattern_trie = None
for ignore in dvc.tree.dvcignore.ignores:
if isinstance(ignore, DvcIgnorePatternsTrie):
ignore_pattern_trie = ignore

assert ignore_pattern_trie is not None
assert (
DvcIgnorePatterns(
DvcIgnorePatterns.from_files(
os.fspath(top_ignore_file), WorkingTree(dvc.root_dir)
)
in dvc.tree.dvcignore.ignores
== ignore_pattern_trie[os.fspath(ignore_file)]
)

assert any(
i for i in dvc.tree.dvcignore.ignores if isinstance(i, DvcIgnoreRepo)
)
Expand Down Expand Up @@ -236,3 +244,102 @@ def test_ignore_directory(tmp_dir, dvc):
assert _files_set("dir", dvc.tree) == {
"dir/{}".format(DvcIgnore.DVCIGNORE_FILE),
}


def test_multi_ignore_file(tmp_dir, dvc, monkeypatch):
tmp_dir.gen({"dir": {"subdir": {"should_ignore": "1", "not_ignore": "1"}}})
tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "dir/subdir/*_ignore")
tmp_dir.gen({"dir": {DvcIgnore.DVCIGNORE_FILE: "!subdir/not_ignore"}})

assert _files_set("dir", dvc.tree) == {
"dir/subdir/not_ignore",
"dir/{}".format(DvcIgnore.DVCIGNORE_FILE),
}


def test_pattern_trie_tree(tmp_dir, dvc):
tmp_dir.gen(
{
"top": {
"first": {
DvcIgnore.DVCIGNORE_FILE: "a\nb\nc",
"middle": {
"second": {
DvcIgnore.DVCIGNORE_FILE: "d\ne\nf",
"bottom": {},
}
},
},
},
"other": {DvcIgnore.DVCIGNORE_FILE: "1\n2\n3"},
}
)
ignore_pattern_trie = None
for ignore in dvc.tree.dvcignore.ignores:
if isinstance(ignore, DvcIgnorePatternsTrie):
ignore_pattern_trie = ignore
break

assert ignore_pattern_trie is not None
ignore_pattern_top = ignore_pattern_trie[os.fspath(tmp_dir / "top")]
ignore_pattern_other = ignore_pattern_trie[os.fspath(tmp_dir / "other")]
ignore_pattern_first = ignore_pattern_trie[
os.fspath(tmp_dir / "top" / "first")
]
ignore_pattern_middle = ignore_pattern_trie[
os.fspath(tmp_dir / "top" / "first" / "middle")
]
ignore_pattern_second = ignore_pattern_trie[
os.fspath(tmp_dir / "top" / "first" / "middle" / "second")
]
ignore_pattern_bottom = ignore_pattern_trie[
os.fspath(tmp_dir / "top" / "first" / "middle" / "second" / "bottom")
]
assert not ignore_pattern_top
assert (
DvcIgnorePatterns([], os.fspath(tmp_dir / "top")) == ignore_pattern_top
)
assert (
DvcIgnorePatterns(["1", "2", "3"], os.fspath(tmp_dir / "other"))
== ignore_pattern_other
)
assert (
DvcIgnorePatterns(
["a", "b", "c"], os.fspath(tmp_dir / "top" / "first")
)
== ignore_pattern_first
)
assert (
DvcIgnorePatterns(
["a", "b", "c"], os.fspath(tmp_dir / "top" / "first")
)
== ignore_pattern_middle
)
assert (
DvcIgnorePatterns(
[
"a",
"b",
"c",
"/middle/second/**/d",
"/middle/second/**/e",
"/middle/second/**/f",
],
os.fspath(tmp_dir / "top" / "first"),
)
== ignore_pattern_second
)
assert (
DvcIgnorePatterns(
[
"a",
"b",
"c",
"/middle/second/**/d",
"/middle/second/**/e",
"/middle/second/**/f",
],
os.fspath(tmp_dir / "top" / "first"),
)
== ignore_pattern_bottom
)
2 changes: 1 addition & 1 deletion tests/unit/test_ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
def mock_dvcignore(dvcignore_path, patterns):
tree = MagicMock()
with patch.object(tree, "open", mock_open(read_data="\n".join(patterns))):
ignore_patterns = DvcIgnorePatterns(dvcignore_path, tree)
ignore_patterns = DvcIgnorePatterns.from_files(dvcignore_path, tree)

return ignore_patterns

Expand Down
Loading

0 comments on commit 58f6534

Please sign in to comment.