From 5ac0a50a4caef02bdb1f5cbd47f4b1a952f4a9ea Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Wed, 24 Jun 2020 22:17:25 +0300 Subject: [PATCH 1/2] tree: walk: dynamically load directories as we go This allows us to save time by not loading directories that we won't be using anyway. This also allows us to list directories for which we don't need access to the remote. --- dvc/repo/tree.py | 56 +++++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 175d80f95a..23cf58a0fa 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -137,10 +137,36 @@ def isfile(self, path): return not self.isdir(path) - def _walk(self, root, trie, topdown=True): + def _add_dir(self, top, trie, out, download_callback=None, **kwargs): + if not self.fetch and not self.stream: + return + + # pull dir cache if needed + dir_cache = out.get_dir_cache(**kwargs) + + # pull dir contents if needed + if self.fetch: + if out.changed_cache(filter_info=top): + used_cache = out.get_used_cache(filter_info=top) + downloaded = self.repo.cloud.pull(used_cache, **kwargs) + if download_callback: + download_callback(downloaded) + + for entry in dir_cache: + entry_relpath = entry[out.remote.tree.PARAM_RELPATH] + if os.name == "nt": + entry_relpath = entry_relpath.replace("/", os.sep) + path_info = out.path_info / entry_relpath + trie[path_info.parts] = None + + def _walk(self, root, trie, topdown=True, **kwargs): dirs = set() files = [] + out = trie.get(root.parts) + if out and out.is_dir_checksum: + self._add_dir(root, trie, out, **kwargs) + root_len = len(root.parts) for key, out in trie.iteritems(prefix=root.parts): # noqa: B301 if key == root.parts: @@ -160,9 +186,7 @@ def _walk(self, root, trie, topdown=True): for dname in dirs: yield from self._walk(root / dname, trie) - def walk( - self, top, topdown=True, onerror=None, download_callback=None, **kwargs - ): + def walk(self, top, topdown=True, 
onerror=None, **kwargs): from pygtrie import Trie assert topdown @@ -185,26 +209,10 @@ def walk( for out in outs: trie[out.path_info.parts] = out - if out.is_dir_checksum and (self.fetch or self.stream): - # pull dir cache if needed - dir_cache = out.get_dir_cache(**kwargs) - - # pull dir contents if needed - if self.fetch: - if out.changed_cache(filter_info=top): - used_cache = out.get_used_cache(filter_info=top) - downloaded = self.repo.cloud.pull(used_cache, **kwargs) - if download_callback: - download_callback(downloaded) - - for entry in dir_cache: - entry_relpath = entry[out.remote.tree.PARAM_RELPATH] - if os.name == "nt": - entry_relpath = entry_relpath.replace("/", os.sep) - path_info = out.path_info / entry_relpath - trie[path_info.parts] = None - - yield from self._walk(root, trie, topdown=topdown) + if out.is_dir_checksum and root.isin_or_eq(out.path_info): + self._add_dir(top, trie, out, **kwargs) + + yield from self._walk(root, trie, topdown=topdown, **kwargs) def isdvc(self, path, **kwargs): try: From f4306b2b1d9822426b4b08cf04d687fe6846caa3 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Wed, 24 Jun 2020 22:25:38 +0300 Subject: [PATCH 2/2] list: add support for tracked directories --- dvc/repo/ls.py | 6 +++--- tests/func/test_ls.py | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/dvc/repo/ls.py b/dvc/repo/ls.py index 9344e8b10d..e02d52116a 100644 --- a/dvc/repo/ls.py +++ b/dvc/repo/ls.py @@ -53,9 +53,9 @@ def _ls(repo, path_info, recursive=None, dvc_only=False): def onerror(exc): raise exc - # use our own RepoTree instance instead of repo.repo_tree since we do not - # want fetch/stream enabled for ls - tree = RepoTree(repo) + # use our own RepoTree instance instead of repo.repo_tree since we want to + # fetch directory listings, but don't want to fetch file contents. 
+ tree = RepoTree(repo, stream=True) ret = {} try: diff --git a/tests/func/test_ls.py b/tests/func/test_ls.py index bdf1c8cc0b..14d57acf15 100644 --- a/tests/func/test_ls.py +++ b/tests/func/test_ls.py @@ -445,3 +445,30 @@ def test_ls_shows_pipeline_tracked_outs(tmp_dir, dvc, scm, run_copy): files = Repo.ls(os.curdir, dvc_only=True) match_files(files, ((("bar",), True),)) + + +def test_ls_granular(erepo_dir): + with erepo_dir.chdir(): + erepo_dir.dvc_gen( + { + "dir": { + "1": "1", + "2": "2", + "subdir": {"foo": "foo", "bar": "bar"}, + } + }, + commit="create dir", + ) + + entries = Repo.ls(os.fspath(erepo_dir), os.path.join("dir", "subdir")) + assert entries == [ + {"isout": False, "isdir": False, "isexec": False, "path": "bar"}, + {"isout": False, "isdir": False, "isexec": False, "path": "foo"}, + ] + + entries = Repo.ls(os.fspath(erepo_dir), "dir") + assert entries == [ + {"isout": False, "isdir": False, "isexec": False, "path": "1"}, + {"isout": False, "isdir": False, "isexec": False, "path": "2"}, + {"isout": False, "isdir": True, "isexec": False, "path": "subdir"}, + ]