diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 3ef1d890a2..78a6bc9927 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -1,37 +1,24 @@ -import os import collections +import os from dvc.repo import locked -from dvc.compat import fspath -Diffable = collections.namedtuple("Diffable", "filename, checksum, size") +Diffable = collections.namedtuple("Diffable", "filename, checksum") Diffable.__doc__ = "Common interface to compare outputs." -Diffable._asdict = lambda x: {"checksum": x.checksum, "size": x.size} - - -def diffable_from_output(output): - try: - size = os.path.getsize(fspath(output.path_info)) - except FileNotFoundError: - size = None - - return Diffable(filename=str(output), checksum=output.checksum, size=size) - +Diffable.__eq__ = lambda self, other: self.filename == other.filename +Diffable.__hash__ = lambda self: hash(self.filename) -def compare_states(old, new): - if old and new: - try: - size = new["size"] - old["size"] - except KeyError: - size = "unknown" - return {"status": "modified", "size": size} - if old and not new: - return {"status": "deleted", "size": old.get("size", "unknown")} +def diffables_from_output(output): + if not output.is_dir_checksum: + return [Diffable(str(output), output.checksum)] - if not old and new: - return {"status": "added", "size": new.get("size", "unknown")} + # Unpack directory and include its outputs in the result + return [Diffable(os.path.join(str(output), ""), output.checksum)] + [ + Diffable(str(output.path_info / entry["relpath"]), entry["md5"]) + for entry in output.dir_cache + ] @locked @@ -40,39 +27,44 @@ def diff(self, a_ref="HEAD", b_ref=None, *, target=None): By default, it compares the working tree with the last commit's tree. When a `target` path is given, it only shows that file's comparison. - It will silently fail when `target` is not found in any of the trees. This implementation differs from `git diff` since DVC doesn't have - the concept of `index`, `dvc diff` would be the same as `dvc diff HEAD`. + the concept of `index`, but it keeps the same interface, thus, + `dvc diff` would be the same as `dvc diff HEAD`. """ outs = {} for branch in self.brancher(revs=[a_ref, b_ref]): outs[branch] = set( - diffable_from_output(out) + diffable for stage in self.stages for out in stage.outs + for diffable in diffables_from_output(out) + if not target or target == str(out) ) + old = outs[a_ref] new = outs[b_ref or "working tree"] - delta = old ^ new - - if target: - delta = set(x for x in delta if x.filename == target) - if not delta: - return - - result = collections.defaultdict(dict) + added = new - old + deleted = old - new + delta = old ^ new - for entry in delta: - states = result[entry.filename] - states.update( - old=states.get("old") or (entry._asdict() if entry in old else {}), - new=states.get("new") or (entry._asdict() if entry in new else {}), + result = { + "added": [entry._asdict() for entry in sorted(added)], + "deleted": [entry._asdict() for entry in sorted(deleted)], + "modified": [], + } + + for _old, _new in zip(sorted(old - delta), sorted(new - delta)): + if _old.checksum == _new.checksum: + continue + + result["modified"].append( + { + "filename": _new.filename, + "checksum": {"old": _old.checksum, "new": _new.checksum}, + } ) - for filename, entry in result.items(): - entry.update({"diff": compare_states(entry["old"], entry["new"])}) - return result diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 12e56db199..889878d682 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -1,8 +1,9 @@ -import pytest import hashlib +import os +import pytest -def _checksum(text): +def digest(text): return hashlib.md5(bytes(text, "utf-8")).hexdigest() @@ -16,11 +17,9 @@ def test_added(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "text") result = { - "file": { - "old": {}, - "new": {"checksum": _checksum("text"), "size": 4}, - "diff": {"status": "added", "size": 4}, - } + "added": [{"filename": "file", "checksum": digest("text")}], + "deleted": [], + "modified": [], } assert result == dvc.diff() @@ -31,11 +30,9 @@ def test_deleted(tmp_dir, scm, dvc): (tmp_dir / "file.dvc").unlink() result = { - "file": { - "old": {"checksum": _checksum("text"), "size": 4}, - "new": {}, - "diff": {"status": "deleted", "size": 4}, - } + "added": [], + "deleted": [{"filename": "file", "checksum": digest("text")}], + "modified": [], } assert result == dvc.diff() @@ -46,11 +43,14 @@ def test_modified(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "second") result = { - "file": { - "old": {"checksum": _checksum("first"), "size": 6}, - "new": {"checksum": _checksum("second"), "size": 6}, - "diff": {"status": "modified", "size": 0}, - } + "added": [], + "deleted": [], + "modified": [ + { + "filename": "file", + "checksum": {"old": digest("first"), "new": digest("second")}, + } + ], } assert result == dvc.diff() @@ -61,24 +61,24 @@ def test_refs(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "second", commit="second version") tmp_dir.dvc_gen("file", "third", commit="third version") - HEAD_2 = _checksum("first") - HEAD_1 = _checksum("second") - HEAD = _checksum("third") + HEAD_2 = digest("first") + HEAD_1 = digest("second") + HEAD = digest("third") assert dvc.diff("HEAD~1") == { - "file": { - "old": {"checksum": HEAD_1, "size": 5}, - "new": {"checksum": HEAD, "size": 5}, - "diff": {"status": "modified", "size": 0}, - } + "added": [], + "deleted": [], + "modified": [ + {"filename": "file", "checksum": {"old": HEAD_1, "new": HEAD}} + ], } - assert dvc.diff("HEAD~1", "HEAD~2") == { - "file": { - "old": {"checksum": HEAD_2, "size": 5}, - "new": {"checksum": HEAD_1, "size": 5}, - "diff": {"status": "modified", "size": 0}, - } + assert dvc.diff("HEAD~2", "HEAD~1") == { + "added": [], + "deleted": [], + "modified": [ + {"filename": "file", "checksum": {"old": HEAD_2, "new": HEAD_1}} + ], } pytest.skip('TODO: test dvc.diff("missing")') @@ -96,28 +96,81 @@ def test_target(tmp_dir, scm, dvc): scm.commit("uppercase") assert dvc.diff("HEAD~1", target="foo") == { - "foo": { - "old": {"checksum": _checksum("foo"), "size": 3}, - "new": {"checksum": _checksum("FOO"), "size": 3}, - "diff": {"status": "modified", "size": 0}, - } + "added": [], + "deleted": [], + "modified": [ + { + "filename": "foo", + "checksum": {"old": digest("foo"), "new": digest("FOO")}, + } + ], } - assert not dvc.diff("HEAD~1", target="missing") + assert dvc.diff("HEAD~1", target="missing") == { + "added": [], + "deleted": [], + "modified": [], + } def test_directories(tmp_dir, scm, dvc): tmp_dir.dvc_gen({"dir": {"1": "1", "2": "2"}}, commit="add a directory") - tmp_dir.dvc_gen({"dir": {"2": "2"}}, commit="delete a file") + tmp_dir.dvc_gen({"dir": {"3": "3"}}, commit="add a file") tmp_dir.dvc_gen({"dir": {"2": "two"}}, commit="modify a file") - tmp_dir.dvc_gen({"dir": {"2": "two", "3": "3"}}, commit="add a file") - # dvc.diff(":/directory", ":/init") --> (add: dir/1, dir/2) - # dvc.diff(":/delete", ":/directory") --> (deleted: dir/1) - # dvc.diff(":/modify", ":/directory") --> (modified: dir/2) - # dvc.diff(":/modify", ":/delete") --> (add: dir/3) - # dvc.diff(":/add a file", ":/modify") - pytest.skip("TODO: define output structure") + (tmp_dir / "dir" / "2").unlink() + dvc.add("dir") + scm.add("dir.dvc") + scm.commit("delete a file") + + assert dvc.diff(":/init", ":/directory") == { + "added": [ + { + "filename": "dir/", + "checksum": "5fb6b29836c388e093ca0715c872fe2a.dir", + }, + {"filename": os.path.join("dir", "1"), "checksum": digest("1")}, + {"filename": "dir/2", "checksum": digest("2")}, + ], + "deleted": [], + "modified": [], + } + + assert dvc.diff(":/directory", ":/modify") == { + "added": [ + {"filename": os.path.join("dir", "3"), "checksum": digest("3")} + ], + "deleted": [], + "modified": [ + { + "filename": os.path.join("dir", ""), + "checksum": { + "old": "5fb6b29836c388e093ca0715c872fe2a.dir", + "new": "9b5faf37366b3370fd98e3e60ca439c1.dir", + }, + }, + { + "filename": os.path.join("dir", "2"), + "checksum": {"old": digest("2"), "new": digest("two")}, + }, + ], + } + + assert dvc.diff(":/modify", ":/delete") == { + "added": [], + "deleted": [ + {"filename": os.path.join("dir", "2"), "checksum": digest("two")} + ], + "modified": [ + { + "filename": os.path.join("dir", ""), + "checksum": { + "old": "9b5faf37366b3370fd98e3e60ca439c1.dir", + "new": "83ae82fb367ac9926455870773ff09e6.dir", + }, + } + ], + } def test_cli(tmp_dir, scm, dvc):