diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py
index 4e1598d204..f76344f119 100644
--- a/dvc/dependency/repo.py
+++ b/dvc/dependency/repo.py
@@ -50,13 +50,24 @@ def _make_repo(self, *, locked=True):
         return external_repo(d["url"], rev=rev)
 
     def _get_checksum(self, locked=True):
+        from dvc.repo.tree import RepoTree
+
         with self._make_repo(locked=locked) as repo:
             try:
                 return repo.find_out_by_relpath(self.def_path).info["md5"]
             except OutputNotFoundError:
                 path = PathInfo(os.path.join(repo.root_dir, self.def_path))
+
+                # we want stream but not fetch, so DVC out directories are
+                # walked, but dir contents are not fetched
+                tree = RepoTree(repo, stream=True)
+
                 # We are polluting our repo cache with some dir listing here
-                return self.repo.cache.local.get_checksum(path)
+                if tree.isdir(path):
+                    return self.repo.cache.local.get_dir_checksum(
+                        path, tree=tree
+                    )
+                return tree.get_file_checksum(path)
 
     def status(self):
         current_checksum = self._get_checksum(locked=True)
diff --git a/dvc/remote/base.py b/dvc/remote/base.py
index ab36047861..d66dfd5510 100644
--- a/dvc/remote/base.py
+++ b/dvc/remote/base.py
@@ -227,7 +227,7 @@ def _collect_dir(self, path_info, tree=None, save_tree=False, **kwargs):
             file_infos.add(fname)
 
         if tree:
-            checksums = {fi: tree.get_checksum(fi) for fi in file_infos}
+            checksums = {fi: tree.get_file_checksum(fi) for fi in file_infos}
             if save_tree:
                 for fi, checksum in checksums.items():
                     self._save_file(fi, checksum, tree=tree, **kwargs)
@@ -259,11 +259,22 @@ def _collect_dir(self, path_info, tree=None, save_tree=False, **kwargs):
         # Sorting the list by path to ensure reproducibility
         return sorted(result, key=itemgetter(self.PARAM_RELPATH))
 
-    def get_dir_checksum(self, path_info):
+    def get_dir_checksum(self, path_info, tree=None):
+        """Collect the listing of ``path_info`` and save it as dir info.
+
+        When ``tree`` is given it is forwarded to ``_collect_dir`` and no
+        state entry is saved for ``path_info``.
+
+        Raises:
+            RemoteCacheRequiredError: if ``self.cache`` is falsy.
+        """
         if not self.cache:
             raise RemoteCacheRequiredError(path_info)
 
-        dir_info = self._collect_dir(path_info)
+        dir_info = self._collect_dir(path_info, tree=tree)
+        if tree:
+            # don't save state entry for path_info if it is a tree path
+            path_info = None
         return self._save_dir_info(dir_info, path_info)
 
     def _save_dir_info(self, dir_info, path_info=None):