diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 75a10877bd..2fa261683d 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -1,7 +1,11 @@ +import logging import os from dvc.repo import locked from dvc.tree.local import LocalTree +from dvc.tree.repo import RepoTree + +logger = logging.getLogger(__name__) @locked @@ -14,59 +18,20 @@ def diff(self, a_rev="HEAD", b_rev=None): `dvc diff` would be the same as `dvc diff HEAD`. """ - def _paths_checksums(): - """ - A dictionary of checksums addressed by relpaths collected from - the current tree outputs. - - To help distinguish between a directory and a file output, - the former one will come with a trailing slash in the path: - - directory: "data/" - file: "data" - """ - - def _to_path(output): - return ( - str(output) - if not output.is_dir_checksum - else os.path.join(str(output), "") - ) - - on_working_tree = isinstance(self.tree, LocalTree) - - def _to_checksum(output): - if on_working_tree: - return self.cache.local.tree.get_hash(output.path_info).value - return output.hash_info.value - - def _exists(output): - if on_working_tree: - return output.exists - return True - - return { - _to_path(output): _to_checksum(output) - for stage in self.stages - for output in stage.outs - if _exists(output) - } - if self.scm.no_commits: return {} - working_tree = self.tree - a_tree = self.scm.get_tree(a_rev) - b_tree = self.scm.get_tree(b_rev) if b_rev else working_tree - - try: - self.tree = a_tree - old = _paths_checksums() + b_rev = b_rev if b_rev else "workspace" + results = {} + for rev in self.brancher(revs=[a_rev, b_rev]): + if rev == "workspace" and rev != b_rev: + # brancher always returns workspace, but we only need to compute + # workspace paths/checksums if b_rev was None + continue + results[rev] = _paths_checksums(self) - self.tree = b_tree - new = _paths_checksums() - finally: - self.tree = working_tree + old = results[a_rev] + new = results[b_rev] # Compare paths between the old and new tree. # set() efficiently converts dict keys to a set @@ -85,3 +50,57 @@ def _exists(output): } return ret if any(ret.values()) else {} + + +def _paths_checksums(repo): + """ + A dictionary of checksums addressed by relpaths collected from + the current tree outputs. + + To help distinguish between a directory and a file output, + the former one will come with a trailing slash in the path: + + directory: "data/" + file: "data" + """ + + return dict(_output_paths(repo)) + + +def _output_paths(repo): + repo_tree = RepoTree(repo, stream=True) + on_working_tree = isinstance(repo.tree, LocalTree) + + def _exists(output): + if on_working_tree: + return output.exists + return True + + def _to_path(output): + return ( + str(output) + if not output.is_dir_checksum + else os.path.join(str(output), "") + ) + + def _to_checksum(output): + if on_working_tree: + return repo.cache.local.tree.get_hash(output.path_info).value + return output.hash_info.value + + for stage in repo.stages: + for output in stage.outs: + if _exists(output): + yield _to_path(output), _to_checksum(output) + if output.is_dir_checksum: + yield from _dir_output_paths(repo_tree, output) + + +def _dir_output_paths(repo_tree, output): + from dvc.config import NoRemoteError + + try: + for fname in repo_tree.walk_files(output.path_info): + yield str(fname), repo_tree.get_hash(fname).value + except NoRemoteError: + logger.warning("dir cache entry for '%s' is missing", output) diff --git a/dvc/tree/dvc.py b/dvc/tree/dvc.py index 520ee42bcc..6cbc1efe6d 100644 --- a/dvc/tree/dvc.py +++ b/dvc/tree/dvc.py @@ -133,6 +133,9 @@ def check_isdir(self, path_info, outs): self._get_granular_checksum(path_info, out) return False except FileNotFoundError: + # path may be an untracked file from a dirty workspace + if self.repo.tree.isfile(path_info): + return False return True def isfile(self, path): # pylint: disable=arguments-differ @@ -258,10 +261,16 @@ def get_file_hash(self, path_info): raise OutputNotFoundError out = outs[0] if out.is_dir_checksum: - return HashInfo( - out.tree.PARAM_CHECKSUM, - self._get_granular_checksum(path_info, out), - ) + try: + return HashInfo( + out.tree.PARAM_CHECKSUM, + self._get_granular_checksum(path_info, out), + ) + except FileNotFoundError: + # path may be an untracked file from a dirty workspace + if self.repo.tree.isfile(path_info): + return self.repo.tree.get_file_hash(path_info) + raise return out.hash_info def metadata(self, path_info): diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 0cd45966d8..1eda16f7a4 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -43,7 +43,11 @@ def test_no_cache_entry(tmp_dir, scm, dvc): dir_checksum = "5fb6b29836c388e093ca0715c872fe2a.dir" assert dvc.diff() == { - "added": [{"path": os.path.join("dir", ""), "hash": dir_checksum}], + "added": [ + {"path": os.path.join("dir", ""), "hash": dir_checksum}, + {"path": os.path.join("dir", "1"), "hash": digest("1")}, + {"path": os.path.join("dir", "2"), "hash": digest("2")}, + ], "deleted": [], "modified": [ { @@ -125,13 +129,15 @@ def test_directories(tmp_dir, scm, dvc): "path": os.path.join("dir", ""), "hash": "5fb6b29836c388e093ca0715c872fe2a.dir", }, + {"path": os.path.join("dir", "1"), "hash": digest("1")}, + {"path": os.path.join("dir", "2"), "hash": digest("2")}, ], "deleted": [], "modified": [], } assert dvc.diff(":/directory", ":/modify") == { - "added": [], + "added": [{"path": os.path.join("dir", "3"), "hash": digest("3")}], "deleted": [], "modified": [ { @@ -141,12 +147,18 @@ def test_directories(tmp_dir, scm, dvc): "new": "9b5faf37366b3370fd98e3e60ca439c1.dir", }, }, + { + "path": os.path.join("dir", "2"), + "hash": {"old": digest("2"), "new": digest("two")}, + }, ], } assert dvc.diff(":/modify", ":/delete") == { "added": [], - "deleted": [], + "deleted": [ + {"path": os.path.join("dir", "2"), "hash": digest("two")}, + ], "modified": [ { "path": os.path.join("dir", ""), @@ -193,7 +205,11 @@ def test_diff_dirty(tmp_dir, scm, dvc): assert result == { "added": [ - {"hash": "86d049de17c76ac44cdcac146042ec9b", "path": "new_file"} + { + "hash": digest("dir file 2 content"), + "path": os.path.join("dir", "dir_file2"), + }, + {"hash": "86d049de17c76ac44cdcac146042ec9b", "path": "new_file"}, ], "deleted": [ {"hash": "7f0b6bb0b7e951b7fd2b2a4a326297e1", "path": "file"}