From 59a143eb08f91c1f4a31ea55bd1ad833fb00c644 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Thu, 25 Jun 2020 16:11:39 +0300 Subject: [PATCH] list: add support for tracked directories (#4108) * tree: walk: dynamically load directories as we go This allows us to save time by not loading directories that we won't be using anyway. This also allows us to list directories for which we don't need access to the remote. * list: add support for tracked directories --- dvc/repo/ls.py | 6 ++--- dvc/repo/tree.py | 56 ++++++++++++++++++++++++------------------- tests/func/test_ls.py | 27 +++++++++++++++++++++ 3 files changed, 62 insertions(+), 27 deletions(-) diff --git a/dvc/repo/ls.py b/dvc/repo/ls.py index 9344e8b10d..e02d52116a 100644 --- a/dvc/repo/ls.py +++ b/dvc/repo/ls.py @@ -53,9 +53,9 @@ def _ls(repo, path_info, recursive=None, dvc_only=False): def onerror(exc): raise exc - # use our own RepoTree instance instead of repo.repo_tree since we do not - # want fetch/stream enabled for ls - tree = RepoTree(repo) + # use our own RepoTree instance instead of repo.repo_tree since we want to + # fetch directory listings, but don't want to fetch file contents. 
+ tree = RepoTree(repo, stream=True) ret = {} try: diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 175d80f95a..23cf58a0fa 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -137,10 +137,36 @@ def isfile(self, path): return not self.isdir(path) - def _walk(self, root, trie, topdown=True): + def _add_dir(self, top, trie, out, download_callback=None, **kwargs): + if not self.fetch and not self.stream: + return + + # pull dir cache if needed + dir_cache = out.get_dir_cache(**kwargs) + + # pull dir contents if needed + if self.fetch: + if out.changed_cache(filter_info=top): + used_cache = out.get_used_cache(filter_info=top) + downloaded = self.repo.cloud.pull(used_cache, **kwargs) + if download_callback: + download_callback(downloaded) + + for entry in dir_cache: + entry_relpath = entry[out.remote.tree.PARAM_RELPATH] + if os.name == "nt": + entry_relpath = entry_relpath.replace("/", os.sep) + path_info = out.path_info / entry_relpath + trie[path_info.parts] = None + + def _walk(self, root, trie, topdown=True, **kwargs): dirs = set() files = [] + out = trie.get(root.parts) + if out and out.is_dir_checksum: + self._add_dir(root, trie, out, **kwargs) + root_len = len(root.parts) for key, out in trie.iteritems(prefix=root.parts): # noqa: B301 if key == root.parts: @@ -160,9 +186,7 @@ def _walk(self, root, trie, topdown=True): for dname in dirs: yield from self._walk(root / dname, trie) - def walk( - self, top, topdown=True, onerror=None, download_callback=None, **kwargs - ): + def walk(self, top, topdown=True, onerror=None, **kwargs): from pygtrie import Trie assert topdown @@ -185,26 +209,10 @@ def walk( for out in outs: trie[out.path_info.parts] = out - if out.is_dir_checksum and (self.fetch or self.stream): - # pull dir cache if needed - dir_cache = out.get_dir_cache(**kwargs) - - # pull dir contents if needed - if self.fetch: - if out.changed_cache(filter_info=top): - used_cache = out.get_used_cache(filter_info=top) - downloaded = 
self.repo.cloud.pull(used_cache, **kwargs) - if download_callback: - download_callback(downloaded) - - for entry in dir_cache: - entry_relpath = entry[out.remote.tree.PARAM_RELPATH] - if os.name == "nt": - entry_relpath = entry_relpath.replace("/", os.sep) - path_info = out.path_info / entry_relpath - trie[path_info.parts] = None - - yield from self._walk(root, trie, topdown=topdown) + if out.is_dir_checksum and root.isin_or_eq(out.path_info): + self._add_dir(top, trie, out, **kwargs) + + yield from self._walk(root, trie, topdown=topdown, **kwargs) def isdvc(self, path, **kwargs): try: diff --git a/tests/func/test_ls.py b/tests/func/test_ls.py index bdf1c8cc0b..14d57acf15 100644 --- a/tests/func/test_ls.py +++ b/tests/func/test_ls.py @@ -445,3 +445,30 @@ def test_ls_shows_pipeline_tracked_outs(tmp_dir, dvc, scm, run_copy): files = Repo.ls(os.curdir, dvc_only=True) match_files(files, ((("bar",), True),)) + + +def test_ls_granular(erepo_dir): + with erepo_dir.chdir(): + erepo_dir.dvc_gen( + { + "dir": { + "1": "1", + "2": "2", + "subdir": {"foo": "foo", "bar": "bar"}, + } + }, + commit="create dir", + ) + + entries = Repo.ls(os.fspath(erepo_dir), os.path.join("dir", "subdir")) + assert entries == [ + {"isout": False, "isdir": False, "isexec": False, "path": "bar"}, + {"isout": False, "isdir": False, "isexec": False, "path": "foo"}, + ] + + entries = Repo.ls(os.fspath(erepo_dir), "dir") + assert entries == [ + {"isout": False, "isdir": False, "isexec": False, "path": "1"}, + {"isout": False, "isdir": False, "isexec": False, "path": "2"}, + {"isout": False, "isdir": True, "isexec": False, "path": "subdir"}, + ]