Skip to content

Commit

Permalink
diff: group by change state and support dirs
Browse files Browse the repository at this point in the history
  • Loading branch information
Mr. Outis committed Jan 26, 2020
1 parent 07abd1b commit 5ace747
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 89 deletions.
80 changes: 36 additions & 44 deletions dvc/repo/diff.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,24 @@
import os
import collections
import os

from dvc.repo import locked
from dvc.compat import fspath


Diffable = collections.namedtuple("Diffable", "filename, checksum, size")
Diffable = collections.namedtuple("Diffable", "filename, checksum")
Diffable.__doc__ = "Common interface to compare outputs."
Diffable._asdict = lambda x: {"checksum": x.checksum, "size": x.size}


def diffable_from_output(output):
try:
size = os.path.getsize(fspath(output.path_info))
except FileNotFoundError:
size = None

return Diffable(filename=str(output), checksum=output.checksum, size=size)

Diffable.__eq__ = lambda self, other: self.filename == other.filename
Diffable.__hash__ = lambda self: hash(self.filename)

def compare_states(old, new):
if old and new:
try:
size = new["size"] - old["size"]
except KeyError:
size = "unknown"
return {"status": "modified", "size": size}

if old and not new:
return {"status": "deleted", "size": old.get("size", "unknown")}
def diffables_from_output(output):
if not output.is_dir_checksum:
return [Diffable(str(output), output.checksum)]

if not old and new:
return {"status": "added", "size": new.get("size", "unknown")}
# Unpack directory and include its outputs in the result
return [Diffable(os.path.join(str(output), ""), output.checksum)] + [
Diffable(str(output.path_info / entry["relpath"]), entry["md5"])
for entry in output.dir_cache
]


@locked
Expand All @@ -40,39 +27,44 @@ def diff(self, a_ref="HEAD", b_ref=None, *, target=None):
By default, it compares the working tree with the last commit's tree.
When a `target` path is given, it only shows that file's comparison.
It will silently fail when `target` is not found in any of the trees.
This implementation differs from `git diff` since DVC doesn't have
the concept of `index`, `dvc diff` would be the same as `dvc diff HEAD`.
the concept of `index`, but it keeps the same interface, thus,
`dvc diff` would be the same as `dvc diff HEAD`.
"""
outs = {}

for branch in self.brancher(revs=[a_ref, b_ref]):
outs[branch] = set(
diffable_from_output(out)
diffable
for stage in self.stages
for out in stage.outs
for diffable in diffables_from_output(out)
if not target or target == str(out)
)

old = outs[a_ref]
new = outs[b_ref or "working tree"]
delta = old ^ new

if target:
delta = set(x for x in delta if x.filename == target)

if not delta:
return

result = collections.defaultdict(dict)
added = new - old
deleted = old - new
delta = old ^ new

for entry in delta:
states = result[entry.filename]
states.update(
old=states.get("old") or (entry._asdict() if entry in old else {}),
new=states.get("new") or (entry._asdict() if entry in new else {}),
result = {
"added": [entry._asdict() for entry in sorted(added)],
"deleted": [entry._asdict() for entry in sorted(deleted)],
"modified": [],
}

for _old, _new in zip(sorted(old - delta), sorted(new - delta)):
if _old.checksum == _new.checksum:
continue

result["modified"].append(
{
"filename": _new.filename,
"checksum": {"old": _old.checksum, "new": _new.checksum},
}
)

for filename, entry in result.items():
entry.update({"diff": compare_states(entry["old"], entry["new"])})

return result
143 changes: 98 additions & 45 deletions tests/func/test_diff.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import pytest
import hashlib
import os
import pytest


def _checksum(text):
def digest(text):
return hashlib.md5(bytes(text, "utf-8")).hexdigest()


Expand All @@ -16,11 +17,9 @@ def test_added(tmp_dir, scm, dvc):
tmp_dir.dvc_gen("file", "text")

result = {
"file": {
"old": {},
"new": {"checksum": _checksum("text"), "size": 4},
"diff": {"status": "added", "size": 4},
}
"added": [{"filename": "file", "checksum": digest("text")}],
"deleted": [],
"modified": [],
}

assert result == dvc.diff()
Expand All @@ -31,11 +30,9 @@ def test_deleted(tmp_dir, scm, dvc):
(tmp_dir / "file.dvc").unlink()

result = {
"file": {
"old": {"checksum": _checksum("text"), "size": 4},
"new": {},
"diff": {"status": "deleted", "size": 4},
}
"added": [],
"deleted": [{"filename": "file", "checksum": digest("text")}],
"modified": [],
}

assert result == dvc.diff()
Expand All @@ -46,11 +43,14 @@ def test_modified(tmp_dir, scm, dvc):
tmp_dir.dvc_gen("file", "second")

result = {
"file": {
"old": {"checksum": _checksum("first"), "size": 6},
"new": {"checksum": _checksum("second"), "size": 6},
"diff": {"status": "modified", "size": 0},
}
"added": [],
"deleted": [],
"modified": [
{
"filename": "file",
"checksum": {"old": digest("first"), "new": digest("second")},
}
],
}

assert result == dvc.diff()
Expand All @@ -61,24 +61,24 @@ def test_refs(tmp_dir, scm, dvc):
tmp_dir.dvc_gen("file", "second", commit="second version")
tmp_dir.dvc_gen("file", "third", commit="third version")

HEAD_2 = _checksum("first")
HEAD_1 = _checksum("second")
HEAD = _checksum("third")
HEAD_2 = digest("first")
HEAD_1 = digest("second")
HEAD = digest("third")

assert dvc.diff("HEAD~1") == {
"file": {
"old": {"checksum": HEAD_1, "size": 5},
"new": {"checksum": HEAD, "size": 5},
"diff": {"status": "modified", "size": 0},
}
"added": [],
"deleted": [],
"modified": [
{"filename": "file", "checksum": {"old": HEAD_1, "new": HEAD}}
],
}

assert dvc.diff("HEAD~1", "HEAD~2") == {
"file": {
"old": {"checksum": HEAD_2, "size": 5},
"new": {"checksum": HEAD_1, "size": 5},
"diff": {"status": "modified", "size": 0},
}
assert dvc.diff("HEAD~2", "HEAD~1") == {
"added": [],
"deleted": [],
"modified": [
{"filename": "file", "checksum": {"old": HEAD_2, "new": HEAD_1}}
],
}

pytest.skip('TODO: test dvc.diff("missing")')
Expand All @@ -96,28 +96,81 @@ def test_target(tmp_dir, scm, dvc):
scm.commit("uppercase")

assert dvc.diff("HEAD~1", target="foo") == {
"foo": {
"old": {"checksum": _checksum("foo"), "size": 3},
"new": {"checksum": _checksum("FOO"), "size": 3},
"diff": {"status": "modified", "size": 0},
}
"added": [],
"deleted": [],
"modified": [
{
"filename": "foo",
"checksum": {"old": digest("foo"), "new": digest("FOO")},
}
],
}

assert not dvc.diff("HEAD~1", target="missing")
assert dvc.diff("HEAD~1", target="missing") == {
"added": [],
"deleted": [],
"modified": [],
}


def test_directories(tmp_dir, scm, dvc):
tmp_dir.dvc_gen({"dir": {"1": "1", "2": "2"}}, commit="add a directory")
tmp_dir.dvc_gen({"dir": {"2": "2"}}, commit="delete a file")
tmp_dir.dvc_gen({"dir": {"3": "3"}}, commit="add a file")
tmp_dir.dvc_gen({"dir": {"2": "two"}}, commit="modify a file")
tmp_dir.dvc_gen({"dir": {"2": "two", "3": "3"}}, commit="add a file")

# dvc.diff(":/directory", ":/init") --> (add: dir/1, dir/2)
# dvc.diff(":/delete", ":/directory") --> (deleted: dir/1)
# dvc.diff(":/modify", ":/directory") --> (modified: dir/2)
# dvc.diff(":/modify", ":/delete") --> (add: dir/3)
# dvc.diff(":/add a file", ":/modify")
pytest.skip("TODO: define output structure")
(tmp_dir / "dir" / "2").unlink()
dvc.add("dir")
scm.add("dir.dvc")
scm.commit("delete a file")

assert dvc.diff(":/init", ":/directory") == {
"added": [
{
"filename": "dir/",
"checksum": "5fb6b29836c388e093ca0715c872fe2a.dir",
},
{"filename": os.path.join("dir", "1"), "checksum": digest("1")},
{"filename": "dir/2", "checksum": digest("2")},
],
"deleted": [],
"modified": [],
}

assert dvc.diff(":/directory", ":/modify") == {
"added": [
{"filename": os.path.join("dir", "3"), "checksum": digest("3")}
],
"deleted": [],
"modified": [
{
"filename": os.path.join("dir", ""),
"checksum": {
"old": "5fb6b29836c388e093ca0715c872fe2a.dir",
"new": "9b5faf37366b3370fd98e3e60ca439c1.dir",
},
},
{
"filename": os.path.join("dir", "2"),
"checksum": {"old": digest("2"), "new": digest("two")},
},
],
}

assert dvc.diff(":/modify", ":/delete") == {
"added": [],
"deleted": [
{"filename": os.path.join("dir", "2"), "checksum": digest("two")}
],
"modified": [
{
"filename": os.path.join("dir", ""),
"checksum": {
"old": "9b5faf37366b3370fd98e3e60ca439c1.dir",
"new": "83ae82fb367ac9926455870773ff09e6.dir",
},
}
],
}


def test_cli(tmp_dir, scm, dvc):
Expand Down

0 comments on commit 5ace747

Please sign in to comment.