Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dvc: get rid of CleanTree #4221

Merged
merged 2 commits into from
Jul 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dvc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ def __init__(
self.dvc_dir = os.path.abspath(os.path.realpath(dvc_dir))

self.wtree = LocalRemoteTree(None, {"url": self.dvc_dir})
self.tree = tree.tree if tree else self.wtree
self.tree = tree or self.wtree

self.load(validate=validate)

Expand Down
122 changes: 34 additions & 88 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,12 @@
import re
from itertools import groupby

from funcy import cached_property
from pathspec.patterns import GitWildMatchPattern
from pathspec.util import normalize_file
from pygtrie import StringTrie

from dvc.path_info import PathInfo
from dvc.pathspec_math import merge_patterns
from dvc.scm.tree import BaseTree
from dvc.system import System
from dvc.utils import relpath

Expand Down Expand Up @@ -162,6 +160,20 @@ def is_dvc_repo(directory):
return dirs, files


class DvcIgnoreFilterNoop:
def __init__(self, tree, root_dir):
pass

def __call__(self, root, dirs, files):
return dirs, files

def is_ignored_dir(self, _):
return False

def is_ignored_file(self, _):
return False


class DvcIgnoreFilter:
def __init__(self, tree, root_dir):
self.tree = tree
Expand Down Expand Up @@ -190,118 +202,52 @@ def __call__(self, root, dirs, files):

return dirs, files

def is_ignored_dir(self, path):
if not self._parents_exist(path):
return True

class CleanTree(BaseTree):
def __init__(self, tree, tree_root=None):
self.tree = tree
if tree_root:
self._tree_root = tree_root
else:
self._tree_root = self.tree.tree_root

@cached_property
def dvcignore(self):
return DvcIgnoreFilter(self.tree, self.tree_root)

@property
def tree_root(self):
return self._tree_root

def open(self, path, mode="r", encoding="utf-8"):
if self.isfile(path):
return self.tree.open(path, mode, encoding)
raise FileNotFoundError

def exists(self, path):
if self.tree.exists(path) and self._parents_exist(path):
if self.tree.isdir(path):
return self._valid_dirname(path)
return self._valid_filename(path)
return False

def isdir(self, path):
return (
self.tree.isdir(path)
and self._parents_exist(path)
and self._valid_dirname(path)
)

def _valid_dirname(self, path):
path = os.path.abspath(path)
if path == self.tree_root:
return True
if path == self.root_dir:
return False
dirname, basename = os.path.split(path)
dirs, _ = self.dvcignore(dirname, [basename], [])
if dirs:
return True
return False
dirs, _ = self(dirname, [basename], [])
return not dirs

def isfile(self, path):
return (
self.tree.isfile(path)
and self._parents_exist(path)
and self._valid_filename(path)
)

def _valid_filename(self, path):
dirname, basename = os.path.split(os.path.normpath(path))
_, files = self.dvcignore(os.path.abspath(dirname), [], [basename])
if files:
def is_ignored_file(self, path):
if not self._parents_exist(path):
return True
return False

def isexec(self, path):
return self.exists(path) and self.tree.isexec(path)
dirname, basename = os.path.split(os.path.normpath(path))
_, files = self(os.path.abspath(dirname), [], [basename])
return not files

def _parents_exist(self, path):
from dvc.repo import Repo

path = PathInfo(path)

# if parent is tree_root or inside a .dvc dir we can skip this check
if path.parent == self.tree_root or Repo.DVC_DIR in path.parts:
# if parent is root_dir or inside a .dvc dir we can skip this check
if path.parent == self.root_dir or Repo.DVC_DIR in path.parts:
return True

# paths outside of the CleanTree root should be ignored
path = relpath(path, self.tree_root)
# paths outside of the repo should be ignored
path = relpath(path, self.root_dir)
if path.startswith("..") or (
os.name == "nt"
and not os.path.commonprefix(
[os.path.abspath(path), self.tree_root]
[os.path.abspath(path), self.root_dir]
)
):
return False

# check if parent directories are in our ignores, starting from
# tree_root
# root_dir
for parent_dir in reversed(PathInfo(path).parents):
dirname, basename = os.path.split(parent_dir)
if basename == ".":
# parent_dir == tree_root
# parent_dir == root_dir
continue
dirs, _ = self.dvcignore(os.path.abspath(dirname), [basename], [])
dirs, _ = self(os.path.abspath(dirname), [basename], [])
if not dirs:
return False
return True

def walk(self, top, topdown=True, onerror=None):
for root, dirs, files in self.tree.walk(
top, topdown=topdown, onerror=onerror
):
dirs[:], files[:] = self.dvcignore(
os.path.abspath(root), dirs, files
)

yield root, dirs, files

def stat(self, path):
if self.exists(path):
return self.tree.stat(path)
raise FileNotFoundError

@property
def hash_jobs(self):
return self.tree.hash_jobs

def makedirs(self, path, mode=0o777, exist_ok=True):
self.tree.makedirs(path, mode=mode, exist_ok=exist_ok)
23 changes: 11 additions & 12 deletions dvc/repo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,8 @@
NotDvcRepoError,
OutputNotFoundError,
)
from dvc.ignore import CleanTree
from dvc.path_info import PathInfo
from dvc.repo.tree import RepoTree
from dvc.scm.tree import is_working_tree
from dvc.utils.fs import path_isin

from ..stage.exceptions import StageFileDoesNotExistError, StageNotFound
Expand Down Expand Up @@ -85,12 +83,19 @@ def __init__(self, root_dir=None, scm=None, rev=None):
tree = scm.get_tree(rev)
self.root_dir = self.find_root(root_dir, tree)
self.scm = scm
self.tree = tree
self.tree = scm.get_tree(
rev, use_dvcignore=True, dvcignore_root=self.root_dir
)
self.state = StateNoop()
else:
root_dir = self.find_root(root_dir)
self.root_dir = os.path.abspath(os.path.realpath(root_dir))
self.tree = LocalRemoteTree(None, {"url": self.root_dir})
self.tree = LocalRemoteTree(
self,
{"url": self.root_dir},
use_dvcignore=True,
dvcignore_root=self.root_dir,
)

self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR)
self.config = Config(self.dvc_dir, tree=self.tree)
Expand Down Expand Up @@ -134,13 +139,7 @@ def tree(self):

@tree.setter
def tree(self, tree):
if is_working_tree(tree) or tree.tree_root == self.root_dir:
root = None
else:
root = self.root_dir
self._tree = (
tree if isinstance(tree, CleanTree) else CleanTree(tree, root)
)
self._tree = tree
# Our graph cache is no longer valid, as it was based on the previous
# tree.
self._reset()
Expand Down Expand Up @@ -605,4 +604,4 @@ def _reset(self):
self.__dict__.pop("graph", None)
self.__dict__.pop("stages", None)
self.__dict__.pop("pipelines", None)
self.__dict__.pop("dvcignore", None)
self.tree.__dict__.pop("dvcignore", None)
12 changes: 6 additions & 6 deletions dvc/repo/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,17 +109,17 @@ def add(
def _find_all_targets(repo, target, recursive):
if os.path.isdir(target) and recursive:
return [
fname
for fname in Tqdm(
os.fspath(path)
for path in Tqdm(
repo.tree.walk_files(target),
desc="Searching " + target,
bar_format=Tqdm.BAR_FMT_NOTOTAL,
unit="file",
)
if not repo.is_dvc_internal(fname)
if not is_dvc_file(fname)
if not repo.scm.belongs_to_scm(fname)
if not repo.scm.is_tracked(fname)
if not repo.is_dvc_internal(os.fspath(path))
if not is_dvc_file(os.fspath(path))
if not repo.scm.belongs_to_scm(os.fspath(path))
if not repo.scm.is_tracked(os.fspath(path))
]
return [target]

Expand Down
8 changes: 6 additions & 2 deletions dvc/repo/brancher.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ def brancher( # noqa: E302

scm = self.scm

self.tree = LocalRemoteTree(self, {"url": self.root_dir})
self.tree = LocalRemoteTree(
self, {"url": self.root_dir}, use_dvcignore=True
)
yield "workspace"

if revs and "workspace" in revs:
Expand All @@ -47,7 +49,9 @@ def brancher( # noqa: E302
try:
if revs:
for sha, names in group_by(scm.resolve_rev, revs).items():
self.tree = scm.get_tree(sha)
self.tree = scm.get_tree(
sha, use_dvcignore=True, dvcignore_root=self.root_dir
)
# ignore revs that don't contain repo root
# (i.e. revs from before a subdir=True repo was init'ed)
if self.tree.exists(self.root_dir):
Expand Down
4 changes: 2 additions & 2 deletions dvc/scm/git/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,8 +375,8 @@ def belongs_to_scm(self, path):
path_parts = os.path.normpath(path).split(os.path.sep)
return basename == self.ignore_file or Git.GIT_DIR in path_parts

def get_tree(self, rev):
return GitTree(self.repo, self.resolve_rev(rev))
def get_tree(self, rev, **kwargs):
return GitTree(self.repo, self.resolve_rev(rev), **kwargs)

def get_rev(self):
return self.repo.rev_parse("HEAD").hexsha
Expand Down
39 changes: 34 additions & 5 deletions dvc/scm/git/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import os
import stat

from funcy import cached_property

from dvc.exceptions import DvcException
from dvc.scm.tree import BaseTree
from dvc.utils import relpath
Expand All @@ -22,7 +24,7 @@ def _item_basename(item):
class GitTree(BaseTree): # pylint:disable=abstract-method
"""Proxies the repo file access methods to Git objects"""

def __init__(self, git, rev):
def __init__(self, git, rev, use_dvcignore=False, dvcignore_root=None):
"""Create GitTree instance

Args:
Expand All @@ -31,11 +33,25 @@ def __init__(self, git, rev):
"""
self.git = git
self.rev = rev
self.use_dvcignore = use_dvcignore
self.dvcignore_root = dvcignore_root

@property
def tree_root(self):
return self.git.working_dir

@cached_property
def dvcignore(self):
from dvc.ignore import DvcIgnoreFilter, DvcIgnoreFilterNoop

root = self.dvcignore_root or self.tree_root
if not self.use_dvcignore:
return DvcIgnoreFilterNoop(self, root)
self.use_dvcignore = False
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoiding recursion. Could wrap this hack in try: finally but will be replaced in the following patch anyway.

ret = DvcIgnoreFilter(self, root)
self.use_dvcignore = True
return ret

def open(self, path, mode="r", encoding="utf-8"):
assert mode in {"r", "rb"}

Expand All @@ -58,20 +74,29 @@ def open(self, path, mode="r", encoding="utf-8"):
return io.StringIO(data.decode(encoding))

def exists(self, path):
return self._git_object_by_path(path) is not None
if self._git_object_by_path(path) is None:
return False

return not self.dvcignore.is_ignored_file(
path
) and not self.dvcignore.is_ignored_dir(path)
Comment on lines +77 to +82
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just mimicing old CleanTree. Could reconsider whether or not we really need to deny direct access here. E.g. you could force git add for a gitignored file.


def isdir(self, path):
obj = self._git_object_by_path(path)
if obj is None:
return False
return obj.mode == GIT_MODE_DIR
if obj.mode != GIT_MODE_DIR:
return False
return not self.dvcignore.is_ignored_dir(path)

def isfile(self, path):
obj = self._git_object_by_path(path)
if obj is None:
return False
# according to git-fast-import(1) file mode could be 644 or 755
return obj.mode & GIT_MODE_FILE == GIT_MODE_FILE
if obj.mode & GIT_MODE_FILE != GIT_MODE_FILE:
return False
return not self.dvcignore.is_ignored_file(path)

@staticmethod
def _is_tree_and_contains(obj, path):
Expand Down Expand Up @@ -145,7 +170,11 @@ def walk(self, top, topdown=True, onerror=None):
onerror(NotADirectoryError(top))
return

yield from self._walk(tree, topdown=topdown)
for root, dirs, files in self._walk(tree, topdown=topdown):
dirs[:], files[:] = self.dvcignore(
os.path.abspath(root), dirs, files
)
yield root, dirs, files

def isexec(self, path):
if not self.exists(path):
Expand Down
2 changes: 1 addition & 1 deletion dvc/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def __init__(self, local_cache):
repo = local_cache.repo
self.repo = repo
self.root_dir = repo.root_dir
self.tree = LocalRemoteTree(None, {})
self.tree = LocalRemoteTree(None, {"url": self.root_dir})

state_config = repo.config.get("state", {})
self.row_limit = state_config.get("row_limit", self.STATE_ROW_LIMIT)
Expand Down
Loading