From 33abc407a1b4ae6aef3e1e83afe39d57a01d8ec7 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Thu, 16 Jul 2020 08:54:05 +0300 Subject: [PATCH] dvc: get rid of WorkingTree (#4216) Working tree is really just a regular local tree and should be used by outputs when trying to compute a hash for themselves. We didn't use it previously because local tree was embeded into the local remote class. Related to #4050 --- dvc/config.py | 6 ++-- dvc/repo/__init__.py | 5 ++- dvc/repo/brancher.py | 4 +-- dvc/scm/tree.py | 66 +++---------------------------------- dvc/state.py | 8 +++-- dvc/tree/local.py | 50 +++++++++++++++++----------- dvc/utils/fs.py | 2 +- tests/func/test_ignore.py | 10 +++--- tests/func/test_tree.py | 32 +++++++++--------- tests/unit/utils/test_fs.py | 6 ++-- 10 files changed, 74 insertions(+), 115 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index 92cfde7fe2..3b083597a7 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -234,7 +234,7 @@ class Config(dict): def __init__( self, dvc_dir=None, validate=True, tree=None, ): # pylint: disable=super-init-not-called - from dvc.scm.tree import WorkingTree + from dvc.tree.local import LocalRemoteTree self.dvc_dir = dvc_dir @@ -248,7 +248,7 @@ def __init__( else: self.dvc_dir = os.path.abspath(os.path.realpath(dvc_dir)) - self.wtree = WorkingTree(self.dvc_dir) + self.wtree = LocalRemoteTree(None, {"url": self.dvc_dir}) self.tree = tree.tree if tree else self.wtree self.load(validate=validate) @@ -326,7 +326,7 @@ def _save_config(self, level, conf_dict): logger.debug(f"Writing '{filename}'.") - tree.makedirs(os.path.dirname(filename), exist_ok=True) + tree.makedirs(os.path.dirname(filename)) config = configobj.ConfigObj(_pack_remotes(conf_dict)) with tree.open(filename, "wb") as fobj: diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 23ebe38a4a..a1c3bc7554 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -77,12 +77,11 @@ def __init__(self, root_dir=None, scm=None, rev=None): from dvc.repo.metrics import Metrics from dvc.repo.plots import Plots from dvc.repo.params import Params - from dvc.scm.tree import WorkingTree + from dvc.tree.local import LocalRemoteTree from dvc.utils.fs import makedirs from dvc.stage.cache import StageCache if scm: - # use GitTree instead of WorkingTree as default repo tree instance tree = scm.get_tree(rev) self.root_dir = self.find_root(root_dir, tree) self.scm = scm @@ -91,7 +90,7 @@ def __init__(self, root_dir=None, scm=None, rev=None): else: root_dir = self.find_root(root_dir) self.root_dir = os.path.abspath(os.path.realpath(root_dir)) - self.tree = WorkingTree(self.root_dir) + self.tree = LocalRemoteTree(None, {"url": self.root_dir}) self.dvc_dir = os.path.join(self.root_dir, self.DVC_DIR) self.config = Config(self.dvc_dir, tree=self.tree) diff --git a/dvc/repo/brancher.py b/dvc/repo/brancher.py index 9f231535d8..c42aa556c6 100644 --- a/dvc/repo/brancher.py +++ b/dvc/repo/brancher.py @@ -1,6 +1,6 @@ from funcy import group_by -from dvc.scm.tree import WorkingTree +from dvc.tree.local import LocalRemoteTree def brancher( # noqa: E302 @@ -29,7 +29,7 @@ def brancher( # noqa: E302 scm = self.scm - self.tree = WorkingTree(self.root_dir) + self.tree = LocalRemoteTree(self, {"url": self.root_dir}) yield "workspace" if revs and "workspace" in revs: diff --git a/dvc/scm/tree.py b/dvc/scm/tree.py index d89f97ee98..0d8e91f618 100644 --- a/dvc/scm/tree.py +++ b/dvc/scm/tree.py @@ -1,8 +1,4 @@ import os -import stat -from multiprocessing import cpu_count - -from funcy import cached_property class BaseTree: @@ -41,63 +37,9 @@ def makedirs(self, path, mode=0o777, exist_ok=True): raise NotImplementedError -class WorkingTree(BaseTree): - """Proxies the repo file access methods to working tree files""" - - def __init__(self, repo_root=None): - repo_root = repo_root or os.getcwd() - self.repo_root = repo_root - - @property - def tree_root(self): - return self.repo_root - - def open(self, path, mode="r", encoding="utf-8"): - """Open file and return a stream.""" - if "b" in mode: - encoding = None - return open(path, mode=mode, encoding=encoding) - - def exists(self, path): - """Test whether a path exists.""" - return os.path.lexists(path) - - def isdir(self, path): - """Return true if the pathname refers to an existing directory.""" - return os.path.isdir(path) - - def isfile(self, path): - """Test whether a path is a regular file""" - return os.path.isfile(path) - - def walk(self, top, topdown=True, onerror=None): - """Directory tree generator. - - See `os.walk` for the docs. Differences: - - no support for symlinks - """ - for root, dirs, files in os.walk( - top, topdown=topdown, onerror=onerror - ): - yield os.path.normpath(root), dirs, files - - def isexec(self, path): - mode = os.stat(path).st_mode - return mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) - - @staticmethod - def stat(path): - return os.stat(path) - - @cached_property - def hash_jobs(self): - return max(1, min(4, cpu_count() // 2)) - - def makedirs(self, path, mode=0o777, exist_ok=True): - os.makedirs(path, mode=mode, exist_ok=exist_ok) - - def is_working_tree(tree): - return isinstance(tree, WorkingTree) or isinstance( - getattr(tree, "tree", None), WorkingTree + from dvc.tree.local import LocalRemoteTree + + return isinstance(tree, LocalRemoteTree) or isinstance( + getattr(tree, "tree", None), LocalRemoteTree ) diff --git a/dvc/state.py b/dvc/state.py index 257c09fd10..7c41bb01e0 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -89,10 +89,12 @@ class State: # pylint: disable=too-many-instance-attributes MAX_UINT = 2 ** 64 - 2 def __init__(self, local_cache): + from dvc.tree.local import LocalRemoteTree + repo = local_cache.repo self.repo = repo self.root_dir = repo.root_dir - self.tree = local_cache.tree.work_tree + self.tree = LocalRemoteTree(None, {}) state_config = repo.config.get("state", {}) self.row_limit = state_config.get("row_limit", self.STATE_ROW_LIMIT) @@ -394,8 +396,8 @@ def get(self, path_info): assert isinstance(path_info, str) or path_info.scheme == "local" path = os.fspath(path_info) - # NOTE: use os.path.exists instead of WorkingTree.exists - # WorkingTree.exists uses lexists() and will return True for broken + # NOTE: use os.path.exists instead of LocalRemoteTree.exists + # because it uses lexists() and will return True for broken # symlinks that we cannot stat() in get_mtime_and_size if not os.path.exists(path): return None diff --git a/dvc/tree/local.py b/dvc/tree/local.py index cfa3e3b77c..6ddcf913a3 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -3,13 +3,11 @@ import os import stat -from funcy import cached_property from shortuuid import uuid from dvc.exceptions import DvcException from dvc.path_info import PathInfo from dvc.scheme import Schemes -from dvc.scm.tree import WorkingTree, is_working_tree from dvc.system import System from dvc.utils import file_md5, relpath, tmp_fname from dvc.utils.fs import ( @@ -42,21 +40,16 @@ def __init__(self, repo, config): url = config.get("url") self.path_info = self.PATH_CLS(url) if url else None + @property + def tree_root(self): + return self.config.get("url") + @property def state(self): from dvc.state import StateNoop return self.repo.state if self.repo else StateNoop() - @cached_property - def work_tree(self): - # When using repo.brancher, repo.tree may change to/from WorkingTree to - # GitTree arbitarily. When repo.tree is GitTree, local cache needs to - # use its own WorkingTree instance. - if self.repo: - return WorkingTree(self.repo.root_dir) - return None - @staticmethod def open(path_info, mode="r", encoding=None): return open(path_info, mode=mode, encoding=encoding) @@ -65,26 +58,39 @@ def exists(self, path_info): assert isinstance(path_info, str) or path_info.scheme == "local" if not self.repo: return os.path.exists(path_info) - return self.work_tree.exists(path_info) + return os.path.lexists(path_info) def isfile(self, path_info): if not self.repo: return os.path.isfile(path_info) - return self.work_tree.isfile(path_info) + return os.path.isfile(path_info) def isdir(self, path_info): if not self.repo: return os.path.isdir(path_info) - return self.work_tree.isdir(path_info) + return os.path.isdir(path_info) def iscopy(self, path_info): return not ( System.is_symlink(path_info) or System.is_hardlink(path_info) ) + def walk(self, top, topdown=True, onerror=None): + """Directory tree generator. + + See `os.walk` for the docs. Differences: + - no support for symlinks + """ + for root, dirs, files in os.walk( + top, topdown=topdown, onerror=onerror + ): + yield os.path.normpath(root), dirs, files + def walk_files(self, path_info, **kwargs): - for fname in self.work_tree.walk_files(path_info): - yield PathInfo(fname) + for root, _, files in self.walk(path_info): + for file in files: + # NOTE: os.path.join is ~5.5 times slower + yield PathInfo(f"{root}{os.sep}{file}") def is_empty(self, path_info): path = path_info.fspath @@ -111,6 +117,14 @@ def remove(self, path_info): def makedirs(self, path_info): makedirs(path_info, exist_ok=True, mode=self.dir_mode) + def isexec(self, path): + mode = os.stat(path).st_mode + return mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) + + @staticmethod + def stat(path): + return os.stat(path) + def move(self, from_info, to_info, mode=None): if from_info.scheme != "local" or to_info.scheme != "local": raise NotImplementedError @@ -215,9 +229,7 @@ def _unprotect_file(self, path): os.chmod(path, self.file_mode) def _unprotect_dir(self, path): - assert is_working_tree(self.repo.tree) - - for fname in self.repo.tree.walk_files(path): + for fname in self.walk_files(path): self._unprotect_file(fname) def unprotect(self, path_info): diff --git a/dvc/utils/fs.py b/dvc/utils/fs.py index b48d0af4a9..637f2b964b 100644 --- a/dvc/utils/fs.py +++ b/dvc/utils/fs.py @@ -44,7 +44,7 @@ def get_mtime_and_size(path, tree): raise continue size += stats.st_size - files_mtimes[file_path] = stats.st_mtime + files_mtimes[os.fspath(file_path)] = stats.st_mtime # We track file changes and moves, which cannot be detected with simply # max(mtime(f) for f in non_ignored_files) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index c9b9feea14..31b66e047f 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -11,8 +11,9 @@ DvcIgnorePatternsTrie, DvcIgnoreRepo, ) +from dvc.path_info import PathInfo from dvc.repo import Repo -from dvc.scm.tree import WorkingTree +from dvc.tree.local import LocalRemoteTree from dvc.utils import relpath from dvc.utils.fs import get_mtime_and_size from tests.dir_helpers import TmpDir @@ -107,7 +108,8 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname): assert ignore_pattern_trie is not None assert ( DvcIgnorePatterns.from_files( - os.fspath(top_ignore_file), WorkingTree(dvc.root_dir) + os.fspath(top_ignore_file), + LocalRemoteTree(None, {"url": dvc.root_dir}), ) == ignore_pattern_trie[os.fspath(ignore_file)] ) @@ -165,7 +167,7 @@ def test_ignore_subrepo(tmp_dir, scm, dvc): scm.commit("init parent dvcignore") subrepo_dir = tmp_dir / "subdir" - assert not dvc.tree.exists(subrepo_dir / "foo") + assert not dvc.tree.exists(PathInfo(subrepo_dir / "foo")) with subrepo_dir.chdir(): subrepo = Repo.init(subdir=True) @@ -173,7 +175,7 @@ def test_ignore_subrepo(tmp_dir, scm, dvc): scm.commit("subrepo init") for _ in subrepo.brancher(all_commits=True): - assert subrepo.tree.exists(subrepo_dir / "foo") + assert subrepo.tree.exists(PathInfo(subrepo_dir / "foo")) def test_ignore_blank_line(tmp_dir, dvc): diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index 2a6b8ba823..f6e6d7021e 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -7,20 +7,20 @@ from dvc.repo.tree import RepoTree from dvc.scm import SCM from dvc.scm.git import GitTree -from dvc.scm.tree import WorkingTree +from dvc.tree.local import LocalRemoteTree from dvc.utils.fs import remove from tests.basic_env import TestDir, TestGit, TestGitSubmodule -class TestWorkingTree(TestDir): +class TestLocalRemoteTree(TestDir): def setUp(self): super().setUp() - self.tree = WorkingTree() + self.tree = LocalRemoteTree(None, {}) def test_open(self): with self.tree.open(self.FOO) as fd: self.assertEqual(fd.read(), self.FOO_CONTENTS) - with self.tree.open(self.UNICODE) as fd: + with self.tree.open(self.UNICODE, encoding="utf-8") as fd: self.assertEqual(fd.read(), self.UNICODE_CONTENTS) def test_exists(self): @@ -109,7 +109,7 @@ def convert_to_sets(walk_results): class TestWalkInNoSCM(AssertWalkEqualMixin, TestDir): def test(self): - tree = WorkingTree(self._root_dir) + tree = LocalRemoteTree(None, {"url": self._root_dir}) self.assertWalkEqual( tree.walk(self._root_dir), [ @@ -128,7 +128,7 @@ def test(self): ) def test_subdir(self): - tree = WorkingTree(self._root_dir) + tree = LocalRemoteTree(None, {"url": self._root_dir}) self.assertWalkEqual( tree.walk(join("data_dir", "data_sub_dir")), [(join("data_dir", "data_sub_dir"), [], ["data_sub"])], @@ -137,7 +137,7 @@ def test_subdir(self): class TestWalkInGit(AssertWalkEqualMixin, TestGit): def test_nobranch(self): - tree = CleanTree(WorkingTree(self._root_dir)) + tree = CleanTree(LocalRemoteTree(None, {"url": self._root_dir})) self.assertWalkEqual( tree.walk("."), [ @@ -230,14 +230,16 @@ def test_cleantree_subrepo(tmp_dir, dvc, scm, monkeypatch): subrepo = Repo.init(subdir=True) subrepo_dir.gen({"foo": "foo", "dir": {"bar": "bar"}}) + path = PathInfo(subrepo_dir) + assert isinstance(dvc.tree, CleanTree) - assert not dvc.tree.exists(subrepo_dir / "foo") - assert not dvc.tree.isfile(subrepo_dir / "foo") - assert not dvc.tree.exists(subrepo_dir / "dir") - assert not dvc.tree.isdir(subrepo_dir / "dir") + assert not dvc.tree.exists(path / "foo") + assert not dvc.tree.isfile(path / "foo") + assert not dvc.tree.exists(path / "dir") + assert not dvc.tree.isdir(path / "dir") assert isinstance(subrepo.tree, CleanTree) - assert subrepo.tree.exists(subrepo_dir / "foo") - assert subrepo.tree.isfile(subrepo_dir / "foo") - assert subrepo.tree.exists(subrepo_dir / "dir") - assert subrepo.tree.isdir(subrepo_dir / "dir") + assert subrepo.tree.exists(path / "foo") + assert subrepo.tree.isfile(path / "foo") + assert subrepo.tree.exists(path / "dir") + assert subrepo.tree.isdir(path / "dir") diff --git a/tests/unit/utils/test_fs.py b/tests/unit/utils/test_fs.py index 2a5cad96be..c78e4a2a47 100644 --- a/tests/unit/utils/test_fs.py +++ b/tests/unit/utils/test_fs.py @@ -8,8 +8,8 @@ import dvc from dvc.ignore import CleanTree from dvc.path_info import PathInfo -from dvc.scm.tree import WorkingTree from dvc.system import System +from dvc.tree.local import LocalRemoteTree from dvc.utils import relpath from dvc.utils.fs import ( BasePathNotInCheckedPathException, @@ -29,7 +29,7 @@ class TestMtimeAndSize(TestDir): def test(self): - tree = CleanTree(WorkingTree(self.root_dir)) + tree = CleanTree(LocalRemoteTree(None, {"url": self.root_dir})) file_time, file_size = get_mtime_and_size(self.DATA, tree) dir_time, dir_size = get_mtime_and_size(self.DATA_DIR, tree) @@ -130,7 +130,7 @@ def test_path_object_and_str_are_valid_types_get_mtime_and_size(tmp_dir): tmp_dir.gen( {"dir": {"dir_file": "dir file content"}, "file": "file_content"} ) - tree = CleanTree(WorkingTree(tmp_dir)) + tree = CleanTree(LocalRemoteTree(None, {"url": os.fspath(tmp_dir)})) time, size = get_mtime_and_size("dir", tree) object_time, object_size = get_mtime_and_size(PathInfo("dir"), tree)