From 198bf55aafad6e98c2b643e652c0b222308cb3c2 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Wed, 22 Jan 2020 12:23:11 -0600 Subject: [PATCH 01/35] diff: reimplement interface and tests from scratch --- dvc/command/diff.py | 140 ++----------------- dvc/repo/brancher.py | 2 + dvc/repo/diff.py | 266 +++++------------------------------ tests/func/test_diff.py | 299 ++++++++-------------------------------- 4 files changed, 111 insertions(+), 596 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 7e064adda0..74f15edf56 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -1,12 +1,7 @@ import argparse import logging -import humanize -import inflect -from funcy import compact - -from dvc.command.base import append_doc_link -from dvc.command.base import CmdBase +from dvc.command.base import CmdBase, append_doc_link from dvc.exceptions import DvcException @@ -14,122 +9,11 @@ class CmdDiff(CmdBase): - @staticmethod - def _print_size(size): - if size < 0: - change = "decreased by {}" - elif size > 0: - change = "increased by {}" - else: - change = "not changed" - natur_size = humanize.naturalsize(abs(size)) - return change.format(natur_size) - - @staticmethod - def _get_md5_string(sign, file_name, checksum): - sample_msg = "" - if file_name: - sample_msg = "{}{} with md5 {}\n" - sample_msg = sample_msg.format(sign, file_name, checksum) - return sample_msg - - @classmethod - def _get_dir_changes(cls, dct): - import dvc.repo.diff as diff - - engine = inflect.engine() - changes_msg = ( - "{} {} untouched, {} {} modified, {} {} added, " - "{} {} deleted, size was {}" - ) - changes_msg = changes_msg.format( - dct[diff.DIFF_IDENT], - engine.plural("file", dct[diff.DIFF_IDENT]), - dct[diff.DIFF_CHANGE], - engine.plural("file", dct[diff.DIFF_CHANGE]), - dct[diff.DIFF_NEW], - engine.plural("file", dct[diff.DIFF_NEW]), - dct[diff.DIFF_DEL], - engine.plural("file", dct[diff.DIFF_DEL]), - cls._print_size(dct[diff.DIFF_SIZE]), - ) - return changes_msg - - @classmethod - def _get_file_changes(cls, dct): - import dvc.repo.diff as diff - - if ( - dct.get(diff.DIFF_OLD_FILE) - and dct.get(diff.DIFF_NEW_FILE) - and dct[diff.DIFF_SIZE] == 0 - ): - msg = "file size was not changed" - elif dct.get(diff.DIFF_NEW_FILE): - msg = "added file with size {}".format( - humanize.naturalsize(dct[diff.DIFF_SIZE]) - ) - elif dct.get(diff.DIFF_OLD_FILE): - msg = "deleted file with size {}".format( - humanize.naturalsize(abs(dct[diff.DIFF_SIZE])) - ) - else: - msg = "file was modified, file size {}".format( - cls._print_size(dct[diff.DIFF_SIZE]) - ) - return msg - - @classmethod - def _get_royal_changes(cls, dct): - import dvc.repo.diff as diff - - if dct[diff.DIFF_SIZE] != diff.DIFF_SIZE_UNKNOWN: - if dct.get("is_dir"): - return cls._get_dir_changes(dct) - else: - return cls._get_file_changes(dct) - return "size is ?" - - @classmethod - def _show(cls, diff_dct): - import dvc.repo.diff as diff - - msg = "dvc diff from {} to {}".format( - diff_dct[diff.DIFF_A_REF], diff_dct[diff.DIFF_B_REF] - ) - if diff_dct.get(diff.DIFF_EQUAL): - logger.info(msg) - return - for dct in diff_dct[diff.DIFF_LIST]: - msg += "\n\ndiff for '{}'\n".format(dct[diff.DIFF_TARGET]) - msg += cls._get_md5_string( - "-", - dct.get(diff.DIFF_OLD_FILE), - dct.get(diff.DIFF_OLD_CHECKSUM), - ) - msg += cls._get_md5_string( - "+", - dct.get(diff.DIFF_NEW_FILE), - dct.get(diff.DIFF_NEW_CHECKSUM), - ) - msg += "\n" - msg += cls._get_royal_changes(dct) - logger.info(msg) - return msg - def run(self): try: - msg = self.repo.diff( - self.args.a_ref, target=self.args.target, b_ref=self.args.b_ref - ) - self._show(msg) + self.repo.diff(self.args.a_ref, self.args.b_ref, self.args.target) except DvcException: - msg = "failed to get 'diff {}'" - args = " ".join( - compact([self.args.target, self.args.a_ref, self.args.b_ref]) - ) - msg = msg.format(args) - logger.exception(msg) + logger.exception("failed to get 'diff {}'") return 1 return 0 @@ -147,15 +31,6 @@ def add_parser(subparsers, parent_parser): help=DIFF_HELP, formatter_class=argparse.RawDescriptionHelpFormatter, ) - diff_parser.add_argument( - "-t", - "--target", - help=( - "Source path to a data file or directory. Default None. " - "If not specified, compares all files and directories " - "that are under DVC control in the current working space." - ), - ) diff_parser.add_argument( "a_ref", help="Git reference from which diff calculates" ) @@ -167,4 +42,13 @@ def add_parser(subparsers, parent_parser): ), nargs="?", ) + diff_parser.add_argument( + "-t", + "--target", + help=( + "Source path to a data file or directory. Default None. " + "If not specified, compares all files and directories " + "that are under DVC control in the current working space." + ), + ) diff_parser.set_defaults(func=CmdDiff) diff --git a/dvc/repo/brancher.py b/dvc/repo/brancher.py index 955b135135..9d10901905 100644 --- a/dvc/repo/brancher.py +++ b/dvc/repo/brancher.py @@ -41,6 +41,8 @@ def brancher( # noqa: E302 if all_tags: revs.extend(scm.list_tags()) + revs = filter(None, revs) + try: if revs: for sha, names in group_by(scm.resolve_rev, revs).items(): diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 165b286b99..dda1e0c79e 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -1,246 +1,56 @@ import os -from errno import ENOENT +import collections -from dvc import logger -from dvc.scm.base import FileNotInCommitError -from dvc.scm.git import DIFF_A_REF, DIFF_B_REF, DIFF_A_TREE, DIFF_B_TREE -from dvc.scm.git import DIFF_EQUAL from . import locked +from ..compat import fspath -DIFF_TARGET = "target" -DIFF_IS_DIR = "is_dir" -DIFF_OLD_FILE = "old_file" -DIFF_OLD_CHECKSUM = "old_checksum" -DIFF_NEW_FILE = "new_file" -DIFF_NEW_CHECKSUM = "new_checksum" -DIFF_SIZE = "size_diff" -DIFF_DEL = "del" -DIFF_IDENT = "ident" -DIFF_CHANGE = "changes" -DIFF_NEW = "new" -DIFF_MOVE = "moves" -DIFF_LIST = "diffs" -DIFF_SIZE_UNKNOWN = "?" -DIFF_A_OUTPUT = "a_output" -DIFF_B_OUTPUT = "b_output" -DIFF_DELETED = "deleted_file" -DIFF_IS_NEW = "created_file" +Diffable = collections.namedtuple("Diffable", "filename, checksum, size") +Diffable.__doc__ = "Common interface for comparable entries." -def _file_not_exists(error, result): - if error.errno == ENOENT: - result.update({DIFF_SIZE: DIFF_SIZE_UNKNOWN}) - else: - raise error - - -def _extract_dir(self, dir_not_exists, output): - """Extract the content of dvc tree file - Args: - self(object) - Repo class instance - dir_not_exists(bool) - flag for directory existence - output(object) - OutputLOCAL class instance - Returns: - dict - dictionary with keys - paths to file in .dvc/cache - values -checksums for that files - """ - if not dir_not_exists: - lst = output.dir_cache - return {i["relpath"]: i["md5"] for i in lst} - return {} - - -def _get_dir_info(dir_not_exists, a_output): - if not dir_not_exists: - return str(a_output), a_output.checksum - return "", "" - - -def _ident_files(a_entries, b_entries): - keys = [ - key for key in a_entries.keys() if b_entries.get(key) == a_entries[key] - ] - return len(keys) - - -def _modified_files(self, a_entries, b_entries): - keys = [key for key in a_entries.keys() if key in b_entries] - diff_size = 0 - modified_count = 0 - for key in keys: - if a_entries[key] != b_entries[key]: - modified_count += 1 - diff_size += os.path.getsize( - self.cache.local.get(b_entries[key]) - ) - os.path.getsize(self.cache.local.get(a_entries[key])) - return modified_count, diff_size - - -def _deleted_files(self, a_entries, b_entries): - diff_size = 0 - deleted_count = 0 - for key, value in a_entries.items(): - if key not in b_entries: - deleted_count += 1 - diff_size -= os.path.getsize(self.cache.local.get(a_entries[key])) - return deleted_count, diff_size - - -def _new_files(self, a_entries, b_entries): - diff_size = 0 - new_count = 0 - for key, value in b_entries.items(): - if key not in a_entries: - new_count += 1 - diff_size += os.path.getsize(self.cache.local.get(b_entries[key])) - return new_count, diff_size - +def diffable_from_output(output): + try: + size = os.path.getsize(fspath(output.path_info)) + except OSError: + size = None -def _get_tree_changes(self, a_entries, b_entries): - result = { - DIFF_DEL: 0, - DIFF_IDENT: 0, - DIFF_CHANGE: 0, - DIFF_NEW: 0, - DIFF_MOVE: 0, - } - result[DIFF_IDENT] = _ident_files(a_entries, b_entries) - result[DIFF_CHANGE], diff_size = _modified_files( - self, a_entries, b_entries + return Diffable( + filename=str(output.path_info), checksum=output.checksum, size=size ) - result[DIFF_SIZE] = diff_size - result[DIFF_DEL], diff_size = _deleted_files(self, a_entries, b_entries) - result[DIFF_SIZE] += diff_size - result[DIFF_NEW], diff_size = _new_files(self, a_entries, b_entries) - result[DIFF_SIZE] += diff_size - return result - - -def _check_local_cache(a_out, is_checked): - if a_out is not None and a_out.scheme != "local": - is_checked.append(str(a_out)) - return True - return False - -def _is_dir(path, a_outs, b_outs): - if a_outs.get(path): - return a_outs[path].is_dir_checksum - else: - return b_outs[path].is_dir_checksum +@locked +def diff(self, a_ref=None, b_ref=None, *, target=None): + """ + By default, it compares the working tree with the last commit's tree. -def _get_diff_outs(self, diff_dct): - self.tree = diff_dct[DIFF_A_TREE] - a_outs = {str(out): out for st in self.stages for out in st.outs} - self.tree = diff_dct[DIFF_B_TREE] - b_outs = {str(out): out for st in self.stages for out in st.outs} - outs_paths = set(a_outs.keys()) - outs_paths.update(b_outs.keys()) - results = {} - non_local_cache = [] - for path in outs_paths: - check1 = _check_local_cache(a_outs.get(path), non_local_cache) - check2 = _check_local_cache(b_outs.get(path), non_local_cache) - # skip files/directories with non-local cache for now - if check1 or check2: - continue - results[path] = {} - results[path][DIFF_A_OUTPUT] = a_outs.get(path) - results[path][DIFF_B_OUTPUT] = b_outs.get(path) - results[path][DIFF_IS_NEW] = path not in a_outs - results[path][DIFF_DELETED] = path not in b_outs - results[path][DIFF_IS_DIR] = _is_dir(path, a_outs, b_outs) - if non_local_cache: - logger.warning( - "Diff is not supported for non-local outputs. Ignoring: {}".format( - non_local_cache - ) - ) - - return results - + When a `target` path is given, it only shows that file's comparison. -def _diff_dir(self, target, diff_dct): - result = {DIFF_TARGET: target} - result[DIFF_IS_DIR] = True - a_entries, b_entries = {}, {} - try: - a_entries = _extract_dir( - self, diff_dct[DIFF_IS_NEW], diff_dct[DIFF_A_OUTPUT] - ) - b_entries = _extract_dir( - self, diff_dct[DIFF_DELETED], diff_dct[DIFF_B_OUTPUT] - ) - result[DIFF_OLD_FILE], result[DIFF_OLD_CHECKSUM] = _get_dir_info( - diff_dct[DIFF_IS_NEW], diff_dct[DIFF_A_OUTPUT] - ) - result[DIFF_NEW_FILE], result[DIFF_NEW_CHECKSUM] = _get_dir_info( - diff_dct[DIFF_DELETED], diff_dct[DIFF_B_OUTPUT] + This implementation differs from `git diff`, since DVC doesn't have + the concept of `index`, `dvc diff` would be the same as `dvc diff HEAD`. + """ + a_ref = a_ref or "HEAD" + outs = {} + + for branch in self.brancher(revs=[a_ref, b_ref]): + outs[branch] = set( + diffable_from_output(out) + for stage in self.stages + for out in stage.outs ) - result.update(_get_tree_changes(self, a_entries, b_entries)) - except IOError as e: - _file_not_exists(e, result) - return result - -def _diff_file(self, target, diff_dct): - result = {DIFF_TARGET: target} - size = 0 - try: - if not diff_dct[DIFF_IS_NEW]: - result[DIFF_OLD_FILE] = target - result[DIFF_OLD_CHECKSUM] = diff_dct[DIFF_A_OUTPUT].checksum - size -= os.path.getsize( - self.cache.local.get(diff_dct[DIFF_A_OUTPUT].checksum) - ) - if not diff_dct[DIFF_DELETED]: - result[DIFF_NEW_FILE] = target - result[DIFF_NEW_CHECKSUM] = diff_dct[DIFF_B_OUTPUT].checksum - size += os.path.getsize( - self.cache.local.get(diff_dct[DIFF_B_OUTPUT].checksum) - ) - result[DIFF_SIZE] = size - except IOError as e: - _file_not_exists(e, result) - return result + old = outs[a_ref] + new = outs[b_ref or "working tree"] + delta = old ^ new + if not delta: + return -def _diff_royal(self, target, diff_dct): - if diff_dct[DIFF_IS_DIR]: - return _diff_dir(self, target, diff_dct) - return _diff_file(self, target, diff_dct) + old &= delta + new &= delta - -@locked -def diff(self, a_ref, target=None, b_ref=None): - """Generates diff message string output - - Args: - target(str) - file/directory to check diff of - a_ref(str) - first tag - (optional) b_ref(str) - second git tag - - Returns: - string: string of output message with diff info - """ - result = {} - diff_dct = self.scm.get_diff_trees(a_ref, b_ref=b_ref) - result[DIFF_A_REF] = diff_dct[DIFF_A_REF] - result[DIFF_B_REF] = diff_dct[DIFF_B_REF] - if diff_dct[DIFF_EQUAL]: - result[DIFF_EQUAL] = True - return result - result[DIFF_LIST] = [] - diff_outs = _get_diff_outs(self, diff_dct) - if target is None: - result[DIFF_LIST] = [ - _diff_royal(self, path, diff_outs[path]) for path in diff_outs - ] - elif target in diff_outs: - result[DIFF_LIST] = [_diff_royal(self, target, diff_outs[target])] - else: - msg = "Have not found file/directory '{}' in the commits" - raise FileNotInCommitError(msg.format(target)) - return result + return { + "old": [entry._asdict() for entry in old], + "new": [entry._asdict() for entry in new], + } diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 15d2cc982f..e496d7cf08 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -1,267 +1,86 @@ -import os +import pytest -import dvc.repo.diff as diff -from dvc.main import main -from dvc.scm.base import FileNotInCommitError -from tests.basic_env import TestDvcGit +def test_no_scm(tmp_dir, dvc): + tmp_dir.dvc_gen("file", "text") -class TestDiff(TestDvcGit): - def setUp(self): - super().setUp() + pytest.skip("TODO: define behavior, should it fail?") - self.new_file = "new_test_file" - self.create(self.new_file, self.new_file) - stage = self.dvc.add(self.new_file)[0] - self.a_ref = self.git.git.rev_parse(self.git.head.commit, short=True) - self.new_checksum = stage.outs[0].checksum - self.git.index.add([self.new_file + ".dvc"]) - self.git.index.commit("adds new_file") - self.test_dct = { - diff.DIFF_A_REF: self.a_ref, - diff.DIFF_B_REF: self.git.git.rev_parse( - self.git.head.commit, short=True - ), - diff.DIFF_LIST: [ - { - diff.DIFF_TARGET: self.new_file, - diff.DIFF_NEW_FILE: self.new_file, - diff.DIFF_NEW_CHECKSUM: self.new_checksum, - diff.DIFF_SIZE: 13, - } - ], - } - def test(self): - out = self.dvc.scm.get_diff_trees(self.a_ref) - self.assertFalse(out[diff.DIFF_EQUAL]) - self.assertEqual(self.a_ref, out[diff.DIFF_A_REF]) - self.assertEqual( - self.git.git.rev_parse(self.git.head.commit, short=True), - out[diff.DIFF_B_REF], - ) +def test_added(tmp_dir, scm, dvc): + tmp_dir.dvc_gen("file", "text") + pytest.skip("TODO: define output structure") -class TestDiffRepo(TestDiff): - def test(self): - result = self.dvc.diff(self.a_ref, target=self.new_file) - self.assertEqual(self.test_dct, result) +def test_deleted(tmp_dir, scm, dvc): + tmp_dir.dvc_gen("file", "text", commit="add file") + (tmp_dir / "file").unlink() -class TestDiffCmdLine(TestDiff): - def test(self): - ret = main(["diff", "-t", self.new_file, self.a_ref]) - self.assertEqual(ret, 0) + pytest.skip("TODO: define output structure") -class TestDiffDir(TestDvcGit): - def setUp(self): - super().setUp() +def test_modified(tmp_dir, scm, dvc): + tmp_dir.dvc_gen("file", "first", commit="first version") + tmp_dir.dvc_gen("file", "second") - stage = self.dvc.add(self.DATA_DIR)[0] - self.git.index.add([self.DATA_DIR + ".dvc"]) - self.git.index.commit("adds data_dir") - self.a_ref = self.git.git.rev_parse( - self.dvc.scm.repo.head.commit, short=True - ) - self.old_checksum = stage.outs[0].checksum - self.new_file = os.path.join(self.DATA_SUB_DIR, diff.DIFF_NEW_FILE) - self.create(self.new_file, self.new_file) - stage = self.dvc.add(self.DATA_DIR)[0] - self.git.index.add([self.DATA_DIR + ".dvc"]) - self.git.index.commit(message="adds data_dir with new_file") - self.new_checksum = stage.outs[0].checksum + pytest.skip("TODO: define output structure") - def test(self): - out = self.dvc.scm.get_diff_trees(self.a_ref) - self.assertFalse(out[diff.DIFF_EQUAL]) - self.assertEqual(self.a_ref, out[diff.DIFF_A_REF]) - self.assertEqual( - self.git.git.rev_parse(self.git.head.commit, short=True), - out[diff.DIFF_B_REF], - ) +def test_refs(tmp_dir, scm, dvc): + tmp_dir.dvc_gen("file", "first", commit="first version") + tmp_dir.dvc_gen("file", "second", commit="second version") + tmp_dir.dvc_gen("file", "third", commit="third version") -class TestDiffDirRepo(TestDiffDir): - maxDiff = None + # dvc.diff("HEAD~1") --> (third, second) + # dvc.diff("HEAD~1", "HEAD~2") --> (second, first) + # dvc.diff("missing") --> error + pytest.skip("TODO: define output structure") - def test(self): - result = self.dvc.diff(self.a_ref, target=self.DATA_DIR) - test_dct = { - diff.DIFF_A_REF: self.git.git.rev_parse(self.a_ref, short=True), - diff.DIFF_B_REF: self.git.git.rev_parse( - self.git.head.commit, short=True - ), - diff.DIFF_LIST: [ - { - diff.DIFF_CHANGE: 0, - diff.DIFF_DEL: 0, - diff.DIFF_IDENT: 2, - diff.DIFF_MOVE: 0, - diff.DIFF_NEW: 1, - diff.DIFF_IS_DIR: True, - diff.DIFF_TARGET: self.DATA_DIR, - diff.DIFF_NEW_FILE: self.DATA_DIR, - diff.DIFF_NEW_CHECKSUM: self.new_checksum, - diff.DIFF_OLD_FILE: self.DATA_DIR, - diff.DIFF_OLD_CHECKSUM: self.old_checksum, - diff.DIFF_SIZE: 30, - } - ], - } - self.assertEqual(test_dct, result) +def test_target(tmp_dir, scm, dvc): + tmp_dir.dvc_gen("foo", "foo") + tmp_dir.dvc_gen("bar", "bar") + scm.add([".gitignore", "foo.dvc", "bar.dvc"]) + scm.commit("lowercase") -class TestDiffDirRepoDeletedFile(TestDiffDir): - maxDiff = None + tmp_dir.dvc_gen("foo", "FOO") + tmp_dir.dvc_gen("bar", "BAR") + scm.add(["foo.dvc", "bar.dvc"]) + scm.commit("uppercase") - def setUp(self): - super().setUp() + # dvc.diff("HEAD~1", target="foo") + # dvc.diff("HEAD~1", target="missing") --> error + pytest.skip("TODO: define output structure") - self.b_ref = self.a_ref - tmp = self.new_checksum - self.new_checksum = self.old_checksum - self.a_ref = str(self.dvc.scm.repo.head.commit) - self.old_checksum = tmp - def test(self): - result = self.dvc.diff( - self.a_ref, b_ref=self.b_ref, target=self.DATA_DIR - ) - test_dct = { - diff.DIFF_A_REF: self.git.git.rev_parse(self.a_ref, short=True), - diff.DIFF_B_REF: self.git.git.rev_parse(self.b_ref, short=True), - diff.DIFF_LIST: [ - { - diff.DIFF_CHANGE: 0, - diff.DIFF_DEL: 1, - diff.DIFF_IDENT: 2, - diff.DIFF_MOVE: 0, - diff.DIFF_NEW: 0, - diff.DIFF_IS_DIR: True, - diff.DIFF_TARGET: self.DATA_DIR, - diff.DIFF_NEW_FILE: self.DATA_DIR, - diff.DIFF_NEW_CHECKSUM: self.new_checksum, - diff.DIFF_OLD_FILE: self.DATA_DIR, - diff.DIFF_OLD_CHECKSUM: self.old_checksum, - diff.DIFF_SIZE: -30, - } - ], - } - self.assertEqual(test_dct, result) +def test_directories(tmp_dir, scm, dvc): + tmp_dir.dvc_gen({"dir": {"1": "1", "2": "2"}}, commit="add a directory") + tmp_dir.dvc_gen({"dir": {"2": "2"}}, commit="delete a file") + tmp_dir.dvc_gen({"dir": {"2": "two"}}, commit="modify a file") + tmp_dir.dvc_gen({"dir": {"2": "two", "3": "3"}}, commit="add a file") + # dvc.diff(":/directory", ":/init") --> (add: dir/1, dir/2) + # dvc.diff(":/delete", ":/directory") --> (deleted: dir/1) + # dvc.diff(":/modify", ":/directory") --> (modified: dir/2) + # dvc.diff(":/modify", ":/delete") --> (add: dir/3) + # dvc.diff(":/add a file", ":/modify") + pytest.skip("TODO: define output structure") -class TestDiffFileNotFound(TestDiffDir): - def setUp(self): - super().setUp() - self.unknown_file = "unknown_file_" + str(id(self)) - def test(self): - with self.assertRaises(FileNotInCommitError): - self.dvc.diff(self.a_ref, target=self.unknown_file) +def test_cli(tmp_dir, scm, dvc): + tmp_dir.dvc_gen("file", "text") + # main(["diff"]) + pytest.skip("TODO: define output structure") -class TestDiffModifiedFile(TestDiff): - maxDiff = None +def test_json(tmp_dir, scm, dvc): + # result = { + # "added": {...}, + # "renamed": {...}, + # "modified": {...}, + # "deleted": {...}, + # } - def setUp(self): - super().setUp() - - self.old_checksum = self.new_checksum - self.new_file_content = "new_test_file_bigger_content_123456789" - self.diff_len = len(self.new_file) + len(self.new_file_content) - self.create(self.new_file, self.new_file_content) - stage = self.dvc.add(self.new_file)[0] - self.git.index.add([self.new_file + ".dvc"]) - self.git.index.commit("change new_file content to be bigger") - self.new_checksum = stage.outs[0].checksum - self.b_ref = self.git.git.rev_parse(self.git.head.commit, short=True) - - def test(self): - result = self.dvc.diff( - self.a_ref, b_ref=self.b_ref, target=self.new_file - ) - test_dct = { - diff.DIFF_A_REF: self.git.git.rev_parse(self.a_ref, short=True), - diff.DIFF_B_REF: self.git.git.rev_parse(self.b_ref, short=True), - diff.DIFF_LIST: [ - { - diff.DIFF_NEW_CHECKSUM: self.new_checksum, - diff.DIFF_NEW_FILE: self.new_file, - diff.DIFF_TARGET: self.new_file, - diff.DIFF_SIZE: self.diff_len, - } - ], - } - self.assertEqual(test_dct, result) - - -class TestDiffDirWithFile(TestDiffDir): - maxDiff = None - - def setUp(self): - super().setUp() - - self.a_ref = self.git.git.rev_parse(self.git.head.commit, short=True) - self.old_checksum = self.new_checksum - self.new_file_content = "new_test_file_bigger_content_123456789" - self.diff_len = len(self.new_file_content) - self.create(self.new_file, self.new_file_content) - stage = self.dvc.add(self.DATA_DIR)[0] - self.git.index.add([self.DATA_DIR + ".dvc"]) - self.git.index.commit(message="modify file in the data dir") - self.new_checksum = stage.outs[0].checksum - self.b_ref = self.git.git.rev_parse(self.git.head.commit, short=True) - - def test(self): - result = self.dvc.diff(self.a_ref, target=self.DATA_DIR) - test_dct = { - diff.DIFF_A_REF: self.git.git.rev_parse(self.a_ref, short=True), - diff.DIFF_B_REF: self.git.git.rev_parse(self.b_ref, short=True), - diff.DIFF_LIST: [ - { - diff.DIFF_IDENT: 2, - diff.DIFF_CHANGE: 1, - diff.DIFF_DEL: 0, - diff.DIFF_MOVE: 0, - diff.DIFF_NEW: 0, - diff.DIFF_IS_DIR: True, - diff.DIFF_TARGET: self.DATA_DIR, - diff.DIFF_NEW_FILE: self.DATA_DIR, - diff.DIFF_NEW_CHECKSUM: self.new_checksum, - diff.DIFF_OLD_FILE: self.DATA_DIR, - diff.DIFF_OLD_CHECKSUM: self.old_checksum, - diff.DIFF_SIZE: self.diff_len, - } - ], - } - self.assertEqual(test_dct, result) - - -class TestDiffCmdMessage(TestDiff): - maxDiff = None - - def test(self): - ret = main( - [ - "diff", - self.test_dct[diff.DIFF_A_REF], - self.test_dct[diff.DIFF_B_REF], - ] - ) - self.assertEqual(ret, 0) - - msg1 = "dvc diff from {0} to {1}".format( - self.git.git.rev_parse(self.test_dct[diff.DIFF_A_REF], short=True), - self.git.git.rev_parse(self.test_dct[diff.DIFF_B_REF], short=True), - ) - msg2 = "diff for '{0}'".format( - self.test_dct[diff.DIFF_LIST][0][diff.DIFF_TARGET] - ) - msg3 = "+{0} with md5 {1}".format( - self.test_dct[diff.DIFF_LIST][0][diff.DIFF_TARGET], - self.test_dct[diff.DIFF_LIST][0][diff.DIFF_NEW_CHECKSUM], - ) - msg4 = "added file with size 13 Bytes" - for m in [msg1, msg2, msg3, msg4]: - assert m in self._caplog.text + # main(["diff", "--json"]) + pytest.skip("TODO: define output structure") From ebd5a5a7f3069ebf176b2888617ef2430b718f87 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Fri, 24 Jan 2020 10:05:44 -0600 Subject: [PATCH 02/35] diff: address comments from review * OSError -> FileNotFoundError * Make `a_ref` optional * Use absolute imports for unrelated packages * Relpath: `str(output.path_info)` -> `str(output)` * `a_ref` defaults to `HEAD` at the signature and parser level --- dvc/command/diff.py | 7 +++++-- dvc/repo/diff.py | 13 +++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 74f15edf56..ae7bc56a42 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -32,13 +32,16 @@ def add_parser(subparsers, parent_parser): formatter_class=argparse.RawDescriptionHelpFormatter, ) diff_parser.add_argument( - "a_ref", help="Git reference from which diff calculates" + "a_ref", + help="Git reference from which diff calculates (defaults to HEAD)", + nargs="?", + default="HEAD" ) diff_parser.add_argument( "b_ref", help=( "Git reference until which diff calculates, if omitted " - "diff shows the difference between current HEAD and a_ref" + "diff shows the difference between the working tree and a_ref" ), nargs="?", ) diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index dda1e0c79e..9d3604e6c8 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -1,8 +1,8 @@ import os import collections -from . import locked -from ..compat import fspath +from dvc.repo import locked +from dvc.compat import fspath Diffable = collections.namedtuple("Diffable", "filename, checksum, size") @@ -12,16 +12,14 @@ def diffable_from_output(output): try: size = os.path.getsize(fspath(output.path_info)) - except OSError: + except FileNotFoundError: size = None - return Diffable( - filename=str(output.path_info), checksum=output.checksum, size=size - ) + return Diffable(filename=str(output), checksum=output.checksum, size=size) @locked -def diff(self, a_ref=None, b_ref=None, *, target=None): +def diff(self, a_ref="HEAD", b_ref=None, *, target=None): """ By default, it compares the working tree with the last commit's tree. @@ -30,7 +28,6 @@ def diff(self, a_ref=None, b_ref=None, *, target=None): This implementation differs from `git diff`, since DVC doesn't have the concept of `index`, `dvc diff` would be the same as `dvc diff HEAD`. """ - a_ref = a_ref or "HEAD" outs = {} for branch in self.brancher(revs=[a_ref, b_ref]): From b2de740556658c73e7fc134273219e383246915a Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Fri, 24 Jan 2020 10:29:59 -0600 Subject: [PATCH 03/35] tests: add result examples --- dvc/command/diff.py | 2 +- tests/func/test_diff.py | 45 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index ae7bc56a42..df576c8dfa 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -35,7 +35,7 @@ def add_parser(subparsers, parent_parser): "a_ref", help="Git reference from which diff calculates (defaults to HEAD)", nargs="?", - default="HEAD" + default="HEAD", ) diff_parser.add_argument( "b_ref", diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index e496d7cf08..e4662a4710 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -10,21 +10,60 @@ def test_no_scm(tmp_dir, dvc): def test_added(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "text") - pytest.skip("TODO: define output structure") + result = { + "old": [], + "new": [ + { + "filename": "file", + "checksum": "1cb251ec0d568de6a929b520c4aed8d1", + "size": 4, + } + ], + } + + assert result == dvc.diff() def test_deleted(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "text", commit="add file") (tmp_dir / "file").unlink() - pytest.skip("TODO: define output structure") + result = { + "old": [ + { + "filename": "file", + "checksum": "1cb251ec0d568de6a929b520c4aed8d1", + "size": 4, + } + ], + "new": [], + } + + assert result == dvc.diff() def test_modified(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "first", commit="first version") tmp_dir.dvc_gen("file", "second") - pytest.skip("TODO: define output structure") + result = { + "old": [ + { + "filename": "file", + "checksum": "8b04d5e3775d298e78455efc5ca404d5", + "size": 6, + } + ], + "new": [ + { + "filename": "file", + "checksum": "a9f0e61a137d86aa9db53465e0801612", + "size": 6, + } + ], + } + + assert result == dvc.diff() def test_refs(tmp_dir, scm, dvc): From f791371224cc3bdf16f0a03ef9195489aed78237 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Fri, 24 Jan 2020 13:29:09 -0600 Subject: [PATCH 04/35] diff: change structure to be indexed by path --- dvc/command/diff.py | 4 +++- dvc/repo/diff.py | 33 +++++++++++++++++++++++------ tests/func/test_diff.py | 47 ++++++++++++++--------------------------- 3 files changed, 46 insertions(+), 38 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index df576c8dfa..7ec4f24f32 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -11,7 +11,9 @@ class CmdDiff(CmdBase): def run(self): try: - self.repo.diff(self.args.a_ref, self.args.b_ref, self.args.target) + self.repo.diff( + self.args.a_ref, self.args.b_ref, target=self.args.target + ) except DvcException: logger.exception("failed to get 'diff {}'") return 1 diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 9d3604e6c8..2f709f401d 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -7,6 +7,7 @@ Diffable = collections.namedtuple("Diffable", "filename, checksum, size") Diffable.__doc__ = "Common interface for comparable entries." +Diffable._asdict = lambda x: {"checksum": x.checksum, "size": x.size} def diffable_from_output(output): @@ -18,6 +19,21 @@ def diffable_from_output(output): return Diffable(filename=str(output), checksum=output.checksum, size=size) +def compare_states(old, new): + if old and new: + try: + size = new["size"] - old["size"] + except KeyError: + size = "unknown" + return {"status": "modified", "size": size} + + if old and not new: + return {"status": "deleted", "size": old.get("size", "unknown")} + + if not old and new: + return {"status": "added", "size": new.get("size", "unknown")} + + @locked def diff(self, a_ref="HEAD", b_ref=None, *, target=None): """ @@ -44,10 +60,15 @@ def diff(self, a_ref="HEAD", b_ref=None, *, target=None): if not delta: return - old &= delta - new &= delta + result = {} + + for entry in delta: + result[entry.filename] = { + "old": entry._asdict() if entry in old else {}, + "new": entry._asdict() if entry in new else {}, + } + + for filename, entry in result.items(): + entry.update({"diff": compare_states(entry["old"], entry["new"])}) - return { - "old": [entry._asdict() for entry in old], - "new": [entry._asdict() for entry in new], - } + return result diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index e4662a4710..4a26af3693 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -11,14 +11,11 @@ def test_added(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "text") result = { - "old": [], - "new": [ - { - "filename": "file", - "checksum": "1cb251ec0d568de6a929b520c4aed8d1", - "size": 4, - } - ], + "file": { + "old": {}, + "new": {"checksum": "1cb251ec0d568de6a929b520c4aed8d1", "size": 4}, + "diff": {"status": "added", "size": 4}, + } } assert result == dvc.diff() @@ -26,17 +23,14 @@ def test_added(tmp_dir, scm, dvc): def test_deleted(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "text", commit="add file") - (tmp_dir / "file").unlink() + (tmp_dir / "file.dvc").unlink() result = { - "old": [ - { - "filename": "file", - "checksum": "1cb251ec0d568de6a929b520c4aed8d1", - "size": 4, - } - ], - "new": [], + "file": { + "old": {"checksum": "1cb251ec0d568de6a929b520c4aed8d1", "size": 4}, + "new": {}, + "diff": {"status": "deleted", "size": 4}, + } } assert result == dvc.diff() @@ -47,20 +41,11 @@ def test_modified(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "second") result = { - "old": [ - { - "filename": "file", - "checksum": "8b04d5e3775d298e78455efc5ca404d5", - "size": 6, - } - ], - "new": [ - { - "filename": "file", - "checksum": "a9f0e61a137d86aa9db53465e0801612", - "size": 6, - } - ], + "file": { + "old": {"checksum": "8b04d5e3775d298e78455efc5ca404d5", "size": 6}, + "new": {}, + "diff": {"status": "deleted", "size": 6}, + } } assert result == dvc.diff() From bc8871eccde98d451b9c33ba0d1b354adbaca476 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Fri, 24 Jan 2020 13:48:54 -0600 Subject: [PATCH 05/35] diff: adjust some wording --- dvc/repo/diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 2f709f401d..499cbb0027 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -6,7 +6,7 @@ Diffable = collections.namedtuple("Diffable", "filename, checksum, size") -Diffable.__doc__ = "Common interface for comparable entries." +Diffable.__doc__ = "Common interface to compare outputs." Diffable._asdict = lambda x: {"checksum": x.checksum, "size": x.size} From 576a27667fd3ca18a7682ce26928f813fad54c2a Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Fri, 24 Jan 2020 19:05:23 -0600 Subject: [PATCH 06/35] diff: fix some tests and adjust result for modify --- dvc/repo/diff.py | 11 ++++++----- tests/func/test_diff.py | 4 ++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 499cbb0027..381f779f68 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -60,13 +60,14 @@ def diff(self, a_ref="HEAD", b_ref=None, *, target=None): if not delta: return - result = {} + result = collections.defaultdict(dict) for entry in delta: - result[entry.filename] = { - "old": entry._asdict() if entry in old else {}, - "new": entry._asdict() if entry in new else {}, - } + states = result[entry.filename] + states.update( + old=states.get("old") or (entry._asdict() if entry in old else {}), + new=states.get("new") or (entry._asdict() if entry in new else {}), + ) for filename, entry in result.items(): entry.update({"diff": compare_states(entry["old"], entry["new"])}) diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 4a26af3693..9a703a5dff 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -43,8 +43,8 @@ def test_modified(tmp_dir, scm, dvc): result = { "file": { "old": {"checksum": "8b04d5e3775d298e78455efc5ca404d5", "size": 6}, - "new": {}, - "diff": {"status": "deleted", "size": 6}, + "new": {"checksum": "a9f0e61a137d86aa9db53465e0801612", "size": 6}, + "diff": {"status": "modified", "size": 0}, } } From 07abd1b41bdbdd7825e389e3c485d7e8da91cf0a Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Sat, 25 Jan 2020 10:59:03 -0600 Subject: [PATCH 07/35] diff: add support for specifying a target --- dvc/repo/diff.py | 7 ++++-- tests/func/test_diff.py | 50 ++++++++++++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 381f779f68..3ef1d890a2 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -40,8 +40,9 @@ def diff(self, a_ref="HEAD", b_ref=None, *, target=None): By default, it compares the working tree with the last commit's tree. When a `target` path is given, it only shows that file's comparison. + It will silently fail when `target` is not found in any of the trees. - This implementation differs from `git diff`, since DVC doesn't have + This implementation differs from `git diff` since DVC doesn't have the concept of `index`, `dvc diff` would be the same as `dvc diff HEAD`. """ outs = {} @@ -52,11 +53,13 @@ def diff(self, a_ref="HEAD", b_ref=None, *, target=None): for stage in self.stages for out in stage.outs ) - old = outs[a_ref] new = outs[b_ref or "working tree"] delta = old ^ new + if target: + delta = set(x for x in delta if x.filename == target) + if not delta: return diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 9a703a5dff..12e56db199 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -1,4 +1,9 @@ import pytest +import hashlib + + +def _checksum(text): + return hashlib.md5(bytes(text, "utf-8")).hexdigest() def test_no_scm(tmp_dir, dvc): @@ -13,7 +18,7 @@ def test_added(tmp_dir, scm, dvc): result = { "file": { "old": {}, - "new": {"checksum": "1cb251ec0d568de6a929b520c4aed8d1", "size": 4}, + "new": {"checksum": _checksum("text"), "size": 4}, "diff": {"status": "added", "size": 4}, } } @@ -27,7 +32,7 @@ def test_deleted(tmp_dir, scm, dvc): result = { "file": { - "old": {"checksum": "1cb251ec0d568de6a929b520c4aed8d1", "size": 4}, + "old": {"checksum": _checksum("text"), "size": 4}, "new": {}, "diff": {"status": "deleted", "size": 4}, } @@ -42,8 +47,8 @@ def test_modified(tmp_dir, scm, dvc): result = { "file": { - "old": {"checksum": "8b04d5e3775d298e78455efc5ca404d5", "size": 6}, - "new": {"checksum": "a9f0e61a137d86aa9db53465e0801612", "size": 6}, + "old": {"checksum": _checksum("first"), "size": 6}, + "new": {"checksum": _checksum("second"), "size": 6}, "diff": {"status": "modified", "size": 0}, } } @@ -56,10 +61,27 @@ def test_refs(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "second", commit="second version") tmp_dir.dvc_gen("file", "third", commit="third version") - # dvc.diff("HEAD~1") --> (third, second) - # dvc.diff("HEAD~1", "HEAD~2") --> (second, first) - # dvc.diff("missing") --> error - pytest.skip("TODO: define output structure") + HEAD_2 = _checksum("first") + HEAD_1 = _checksum("second") + HEAD = _checksum("third") + + assert dvc.diff("HEAD~1") == { + "file": { + "old": {"checksum": HEAD_1, "size": 5}, + "new": {"checksum": HEAD, "size": 5}, + "diff": {"status": "modified", "size": 0}, + } + } + + assert dvc.diff("HEAD~1", "HEAD~2") == { + "file": { + "old": {"checksum": HEAD_2, "size": 5}, + "new": {"checksum": HEAD_1, "size": 5}, + "diff": {"status": "modified", "size": 0}, + } + } + + pytest.skip('TODO: test dvc.diff("missing")') def test_target(tmp_dir, scm, dvc): @@ -73,9 +95,15 @@ def test_target(tmp_dir, scm, dvc): scm.add(["foo.dvc", "bar.dvc"]) scm.commit("uppercase") - # dvc.diff("HEAD~1", target="foo") - # dvc.diff("HEAD~1", target="missing") --> error - pytest.skip("TODO: define output structure") + assert dvc.diff("HEAD~1", target="foo") == { + "foo": { + "old": {"checksum": _checksum("foo"), "size": 3}, + "new": {"checksum": _checksum("FOO"), "size": 3}, + "diff": {"status": "modified", "size": 0}, + } + } + + assert not dvc.diff("HEAD~1", target="missing") def test_directories(tmp_dir, scm, dvc): From 5ace747c05649965a994b279a9e3e1ea763225be Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Sun, 26 Jan 2020 01:40:57 -0600 Subject: [PATCH 08/35] diff: group by change state and support dirs --- dvc/repo/diff.py | 80 ++++++++++------------ tests/func/test_diff.py | 143 +++++++++++++++++++++++++++------------- 2 files changed, 134 insertions(+), 89 deletions(-) diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 3ef1d890a2..78a6bc9927 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -1,37 +1,24 @@ -import os import collections +import os from dvc.repo import locked -from dvc.compat import fspath -Diffable = collections.namedtuple("Diffable", "filename, checksum, size") +Diffable = collections.namedtuple("Diffable", "filename, checksum") Diffable.__doc__ = "Common interface to compare outputs." -Diffable._asdict = lambda x: {"checksum": x.checksum, "size": x.size} - - -def diffable_from_output(output): - try: - size = os.path.getsize(fspath(output.path_info)) - except FileNotFoundError: - size = None - - return Diffable(filename=str(output), checksum=output.checksum, size=size) - +Diffable.__eq__ = lambda self, other: self.filename == other.filename +Diffable.__hash__ = lambda self: hash(self.filename) -def compare_states(old, new): - if old and new: - try: - size = new["size"] - old["size"] - except KeyError: - size = "unknown" - return {"status": "modified", "size": size} - if old and not new: - return {"status": "deleted", "size": old.get("size", "unknown")} +def diffables_from_output(output): + if not output.is_dir_checksum: + return [Diffable(str(output), output.checksum)] - if not old and new: - return {"status": "added", "size": new.get("size", "unknown")} + # Unpack directory and include its outputs in the result + return [Diffable(os.path.join(str(output), ""), output.checksum)] + [ + Diffable(str(output.path_info / entry["relpath"]), entry["md5"]) + for entry in output.dir_cache + ] @locked @@ -40,39 +27,44 @@ def diff(self, a_ref="HEAD", b_ref=None, *, target=None): By default, it compares the working tree with the last commit's tree. When a `target` path is given, it only shows that file's comparison. - It will silently fail when `target` is not found in any of the trees. This implementation differs from `git diff` since DVC doesn't have - the concept of `index`, `dvc diff` would be the same as `dvc diff HEAD`. + the concept of `index`, but it keeps the same interface, thus, + `dvc diff` would be the same as `dvc diff HEAD`. """ outs = {} for branch in self.brancher(revs=[a_ref, b_ref]): outs[branch] = set( - diffable_from_output(out) + diffable for stage in self.stages for out in stage.outs + for diffable in diffables_from_output(out) + if not target or target == str(out) ) + old = outs[a_ref] new = outs[b_ref or "working tree"] - delta = old ^ new - - if target: - delta = set(x for x in delta if x.filename == target) - if not delta: - return - - result = collections.defaultdict(dict) + added = new - old + deleted = old - new + delta = old ^ new - for entry in delta: - states = result[entry.filename] - states.update( - old=states.get("old") or (entry._asdict() if entry in old else {}), - new=states.get("new") or (entry._asdict() if entry in new else {}), + result = { + "added": [entry._asdict() for entry in sorted(added)], + "deleted": [entry._asdict() for entry in sorted(deleted)], + "modified": [], + } + + for _old, _new in zip(sorted(old - delta), sorted(new - delta)): + if _old.checksum == _new.checksum: + continue + + result["modified"].append( + { + "filename": _new.filename, + "checksum": {"old": _old.checksum, "new": _new.checksum}, + } ) - for filename, entry in result.items(): - entry.update({"diff": compare_states(entry["old"], entry["new"])}) - return result diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 12e56db199..889878d682 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -1,8 +1,9 @@ -import pytest import hashlib +import os +import pytest -def _checksum(text): +def digest(text): return hashlib.md5(bytes(text, "utf-8")).hexdigest() @@ -16,11 +17,9 @@ def test_added(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "text") result = { - "file": { - "old": {}, - "new": {"checksum": _checksum("text"), "size": 4}, - "diff": {"status": "added", "size": 4}, - } + "added": [{"filename": "file", "checksum": digest("text")}], + "deleted": [], + "modified": [], } assert result == dvc.diff() @@ -31,11 +30,9 @@ def test_deleted(tmp_dir, scm, dvc): (tmp_dir / "file.dvc").unlink() result = { - "file": { - "old": {"checksum": _checksum("text"), "size": 4}, - "new": {}, - "diff": {"status": "deleted", "size": 4}, - } + "added": [], + "deleted": [{"filename": "file", "checksum": digest("text")}], + "modified": [], } assert result == dvc.diff() @@ -46,11 +43,14 @@ def test_modified(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "second") result = { - "file": { - "old": {"checksum": _checksum("first"), "size": 6}, - "new": {"checksum": _checksum("second"), "size": 6}, - "diff": {"status": "modified", "size": 0}, - } + "added": [], + "deleted": [], + "modified": [ + { + "filename": "file", + "checksum": {"old": digest("first"), "new": digest("second")}, + } + ], } assert result == dvc.diff() @@ -61,24 +61,24 @@ def test_refs(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "second", commit="second version") tmp_dir.dvc_gen("file", "third", commit="third version") - HEAD_2 = _checksum("first") - HEAD_1 = _checksum("second") - HEAD = _checksum("third") + HEAD_2 = digest("first") + HEAD_1 = digest("second") + HEAD = digest("third") assert dvc.diff("HEAD~1") == { - "file": { - "old": {"checksum": HEAD_1, "size": 5}, - "new": {"checksum": HEAD, "size": 5}, - "diff": {"status": "modified", "size": 0}, - } + "added": [], + "deleted": [], + "modified": [ + {"filename": "file", "checksum": {"old": HEAD_1, "new": HEAD}} + ], } - assert dvc.diff("HEAD~1", "HEAD~2") == { - "file": { - "old": {"checksum": HEAD_2, "size": 5}, - "new": {"checksum": HEAD_1, "size": 5}, - "diff": {"status": "modified", "size": 0}, - } + assert dvc.diff("HEAD~2", "HEAD~1") == { + "added": [], + "deleted": [], + "modified": [ + {"filename": "file", "checksum": {"old": HEAD_2, "new": HEAD_1}} + ], } pytest.skip('TODO: test dvc.diff("missing")') @@ -96,28 +96,81 @@ def test_target(tmp_dir, scm, dvc): scm.commit("uppercase") assert dvc.diff("HEAD~1", target="foo") == { - "foo": { - "old": {"checksum": _checksum("foo"), "size": 3}, - "new": {"checksum": _checksum("FOO"), "size": 3}, - "diff": {"status": "modified", "size": 0}, - } + "added": [], + "deleted": [], + "modified": [ + { + "filename": "foo", + "checksum": {"old": digest("foo"), "new": digest("FOO")}, + } + ], } - assert not dvc.diff("HEAD~1", target="missing") + assert dvc.diff("HEAD~1", target="missing") == { + "added": [], + "deleted": [], + "modified": [], + } def test_directories(tmp_dir, scm, dvc): tmp_dir.dvc_gen({"dir": {"1": "1", "2": "2"}}, commit="add a directory") - tmp_dir.dvc_gen({"dir": {"2": "2"}}, commit="delete a file") + tmp_dir.dvc_gen({"dir": {"3": "3"}}, commit="add a file") tmp_dir.dvc_gen({"dir": {"2": "two"}}, commit="modify a file") - tmp_dir.dvc_gen({"dir": {"2": "two", "3": "3"}}, commit="add a file") - # dvc.diff(":/directory", ":/init") --> (add: dir/1, dir/2) - # dvc.diff(":/delete", ":/directory") --> (deleted: dir/1) - # dvc.diff(":/modify", ":/directory") --> (modified: dir/2) - # dvc.diff(":/modify", ":/delete") --> (add: dir/3) - # dvc.diff(":/add a file", ":/modify") - pytest.skip("TODO: define output structure") + (tmp_dir / "dir" / "2").unlink() + dvc.add("dir") + scm.add("dir.dvc") + scm.commit("delete a file") + + assert dvc.diff(":/init", ":/directory") == { + "added": [ + { + "filename": "dir/", + "checksum": "5fb6b29836c388e093ca0715c872fe2a.dir", + }, + {"filename": os.path.join("dir", "1"), "checksum": digest("1")}, + {"filename": "dir/2", "checksum": digest("2")}, + ], + "deleted": [], + "modified": [], + } + + assert dvc.diff(":/directory", ":/modify") == { + "added": [ + {"filename": os.path.join("dir", "3"), "checksum": digest("3")} + ], + "deleted": [], + "modified": [ + { + "filename": os.path.join("dir", ""), + "checksum": { + "old": "5fb6b29836c388e093ca0715c872fe2a.dir", + "new": "9b5faf37366b3370fd98e3e60ca439c1.dir", + }, + }, + { + "filename": os.path.join("dir", "2"), + "checksum": {"old": digest("2"), "new": digest("two")}, + }, + ], + } + + assert dvc.diff(":/modify", ":/delete") == { + "added": [], + "deleted": [ + {"filename": os.path.join("dir", "2"), "checksum": digest("two")} + ], + "modified": [ + { + "filename": os.path.join("dir", ""), + "checksum": { + "old": "9b5faf37366b3370fd98e3e60ca439c1.dir", + "new": "83ae82fb367ac9926455870773ff09e6.dir", + }, + } + ], + } def test_cli(tmp_dir, scm, dvc): From 3db983fc2ed200514c2ad6fb250fefe2329aaeb1 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Sun, 26 Jan 2020 02:00:38 -0600 Subject: [PATCH 09/35] tests/diff: use os.path.join instead of implying separator --- tests/func/test_diff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 889878d682..ce92caa4b3 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -126,11 +126,11 @@ def test_directories(tmp_dir, scm, dvc): assert dvc.diff(":/init", ":/directory") == { "added": [ { - "filename": "dir/", + "filename": os.path.dir("dir", ""), "checksum": "5fb6b29836c388e093ca0715c872fe2a.dir", }, {"filename": os.path.join("dir", "1"), "checksum": digest("1")}, - {"filename": "dir/2", "checksum": digest("2")}, + {"filename": os.path.join("dir", "2"), "checksum": digest("2")}, ], "deleted": [], "modified": [], From b251682aece9cc49631d820cefb8491ddad27dd6 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Sun, 26 Jan 2020 12:54:07 -0600 Subject: [PATCH 10/35] diff: document diffables_from_output --- dvc/repo/diff.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 78a6bc9927..64d6348607 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -11,14 +11,36 @@ def diffables_from_output(output): - if not output.is_dir_checksum: - return [Diffable(str(output), output.checksum)] - - # Unpack directory and include its outputs in the result - return [Diffable(os.path.join(str(output), ""), output.checksum)] + [ - Diffable(str(output.path_info / entry["relpath"]), entry["md5"]) - for entry in output.dir_cache - ] + """ + Transform an output into a list of Diffable objects so we can + compare them lately. + + Unpack directories to include entries' Diffables. + + To help distinguish between an a directory output and a file output, + the former one will come with a trailing slash in the filename: + + directory: "data/" + file: "data" + + You can also rely on the checksum to tell whether it was computed for + a file or a directory as a whole. + """ + if output.is_dir_checksum: + return [ + Diffable( + filename=os.path.join(str(output), ""), + checksum=output.checksum, + ) + ] + [ + Diffable( + filename=str(output.path_info / entry["relpath"]), + checksum=entry["md5"], + ) + for entry in output.dir_cache + ] + + return [Diffable(filename=str(output), checksum=output.checksum)] @locked From a4ade13c0abaaa51a0f7e33e24a00e6126571a1c Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Sun, 26 Jan 2020 13:20:31 -0600 Subject: [PATCH 11/35] diff: add JSON output --- dvc/command/diff.py | 16 +++++++++++++++- tests/func/test_diff.py | 29 +++++++++++++++++++---------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 7ec4f24f32..7f03b1c5c4 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -1,4 +1,5 @@ import argparse +import json import logging from dvc.command.base import CmdBase, append_doc_link @@ -11,9 +12,16 @@ class CmdDiff(CmdBase): def run(self): try: - self.repo.diff( + diff = self.repo.diff( self.args.a_ref, self.args.b_ref, target=self.args.target ) + if not any(diff.values()): + return 0 + + if self.args.json: + print(json.dumps(diff)) + return 0 + except DvcException: logger.exception("failed to get 'diff {}'") return 1 @@ -56,4 +64,10 @@ def add_parser(subparsers, parent_parser): "that are under DVC control in the current working space." ), ) + diff_parser.add_argument( + "--json", + help=("Format the output into a JSON"), + action="store_true", + default=False, + ) diff_parser.set_defaults(func=CmdDiff) diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index ce92caa4b3..192b568870 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -1,7 +1,10 @@ import hashlib +import json import os import pytest +from dvc.main import main + def digest(text): return hashlib.md5(bytes(text, "utf-8")).hexdigest() @@ -126,7 +129,7 @@ def test_directories(tmp_dir, scm, dvc): assert dvc.diff(":/init", ":/directory") == { "added": [ { - "filename": os.path.dir("dir", ""), + "filename": os.path.join("dir", ""), "checksum": "5fb6b29836c388e093ca0715c872fe2a.dir", }, {"filename": os.path.join("dir", "1"), "checksum": digest("1")}, @@ -179,13 +182,19 @@ def test_cli(tmp_dir, scm, dvc): pytest.skip("TODO: define output structure") -def test_json(tmp_dir, scm, dvc): - # result = { - # "added": {...}, - # "renamed": {...}, - # "modified": {...}, - # "deleted": {...}, - # } +def test_json(tmp_dir, scm, dvc, capsys): + assert 0 == main(["diff", "--json"]) + assert not capsys.readouterr().out - # main(["diff", "--json"]) - pytest.skip("TODO: define output structure") + tmp_dir.dvc_gen("file", "text") + assert 0 == main(["diff", "--json"]) + assert ( + json.dumps( + { + "added": [{"filename": "file", "checksum": digest("text")}], + "deleted": [], + "modified": [], + } + ) + in capsys.readouterr().out + ) From bd1adf2c23c27435c76875951163be747d9c9ec6 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Sun, 26 Jan 2020 15:06:30 -0600 Subject: [PATCH 12/35] diff: add CLI output --- dvc/command/diff.py | 46 ++++++++++++++++++++++++++++++++ dvc/repo/diff.py | 4 +-- tests/func/test_diff.py | 59 +++++++++++++++++++++++++++++++++++------ 3 files changed, 99 insertions(+), 10 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 7f03b1c5c4..9e43709b4e 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -2,6 +2,8 @@ import json import logging +import colorama + from dvc.command.base import CmdBase, append_doc_link from dvc.exceptions import DvcException @@ -10,6 +12,48 @@ class CmdDiff(CmdBase): + @staticmethod + def _format(diff): + """ + Given a diff structure, generate a string of filenames separated + by new lines and grouped together by their state. + + A group's header is colored and its entries are sorted to enhance + readability, for example: + + Added: + another_file.txt + backup.tar + dir/ + dir/1 + + If a group has no entries, it won't be included in the result. + """ + colors = { + "added": colorama.Fore.GREEN, + "modified": colorama.Fore.YELLOW, + "deleted": colorama.Fore.RED, + } + + groups = [] + + for key, values in diff.items(): + if not values: + continue + + entries = sorted(" " + entry["filename"] for entry in values) + + groups.append( + "{color}{header}{nc}:\n{entries}".format( + color=colors[key], + header=key.capitalize(), + nc=colorama.Fore.RESET, + entries="\n".join(entries), + ) + ) + + return "\n\n".join(groups) + def run(self): try: diff = self.repo.diff( @@ -22,6 +66,8 @@ def run(self): print(json.dumps(diff)) return 0 + print(self._format(diff)) + except DvcException: logger.exception("failed to get 'diff {}'") return 1 diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 64d6348607..a1ec334c03 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -10,7 +10,7 @@ Diffable.__hash__ = lambda self: hash(self.filename) -def diffables_from_output(output): +def _diffables_from_output(output): """ Transform an output into a list of Diffable objects so we can compare them lately. @@ -61,7 +61,7 @@ def diff(self, a_ref="HEAD", b_ref=None, *, target=None): diffable for stage in self.stages for out in stage.outs - for diffable in diffables_from_output(out) + for diffable in _diffables_from_output(out) if not target or target == str(out) ) diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 192b568870..f7bb34d086 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -3,6 +3,8 @@ import os import pytest +import colorama + from dvc.main import main @@ -176,17 +178,16 @@ def test_directories(tmp_dir, scm, dvc): } -def test_cli(tmp_dir, scm, dvc): - tmp_dir.dvc_gen("file", "text") - # main(["diff"]) - pytest.skip("TODO: define output structure") - - -def test_json(tmp_dir, scm, dvc, capsys): +def test_json(tmp_dir, scm, dvc, mocker, capsys): assert 0 == main(["diff", "--json"]) assert not capsys.readouterr().out - tmp_dir.dvc_gen("file", "text") + diff = { + "added": [{"filename": "file", "checksum": digest("text")}], + "deleted": [], + "modified": [], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) assert 0 == main(["diff", "--json"]) assert ( json.dumps( @@ -198,3 +199,45 @@ def test_json(tmp_dir, scm, dvc, capsys): ) in capsys.readouterr().out ) + + +def test_cli(tmp_dir, scm, dvc, mocker, capsys): + assert 0 == main(["diff"]) + assert not capsys.readouterr().out + + diff = { + "added": [{"filename": "file", "checksum": digest("text")}], + "deleted": [], + "modified": [], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + assert 0 == main(["diff"]) + assert ( + "{green}Added{nc}:\n" + " file\n".format(green=colorama.Fore.GREEN, nc=colorama.Fore.RESET) + == capsys.readouterr().out + ) + + diff = { + "added": [], + "deleted": [{"filename": "file", "checksum": digest("text")}], + "modified": [ + {"filename": "foo", "checksum": digest("foo")}, + {"filename": "bar", "checksum": digest("bar")}, + ], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + assert 0 == main(["diff"]) + assert ( + "{red}Deleted{nc}:\n" + " file\n" + "\n" + "{yellow}Modified{nc}:\n" + " bar\n" + " foo\n".format( + red=colorama.Fore.RED, + yellow=colorama.Fore.YELLOW, + nc=colorama.Fore.RESET, + ) + == capsys.readouterr().out + ) From fe4eba5941334417a340b4c38cceef1f4d0eb59f Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Sun, 26 Jan 2020 17:43:41 -0600 Subject: [PATCH 13/35] diff: handle exceptions --- dvc/command/diff.py | 2 +- dvc/repo/diff.py | 5 +++++ dvc/scm/base.py | 2 +- dvc/scm/git/__init__.py | 7 ++++++- tests/func/test_diff.py | 9 +++++++-- 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 9e43709b4e..e68c269acf 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -69,7 +69,7 @@ def run(self): print(self._format(diff)) except DvcException: - logger.exception("failed to get 'diff {}'") + logger.exception("failed to get diff") return 1 return 0 diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index a1ec334c03..89676f24b1 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -1,7 +1,9 @@ import collections import os +from dvc.exceptions import DvcException from dvc.repo import locked +from dvc.scm.git import Git Diffable = collections.namedtuple("Diffable", "filename, checksum") @@ -54,6 +56,9 @@ def diff(self, a_ref="HEAD", b_ref=None, *, target=None): the concept of `index`, but it keeps the same interface, thus, `dvc diff` would be the same as `dvc diff HEAD`. """ + if type(self.scm) is not Git: + raise DvcException("only supported for Git repositories") + outs = {} for branch in self.brancher(revs=[a_ref, b_ref]): diff --git a/dvc/scm/base.py b/dvc/scm/base.py index 412731ca89..e35c7b7d7d 100644 --- a/dvc/scm/base.py +++ b/dvc/scm/base.py @@ -29,7 +29,7 @@ def __init__(self, url, path): class RevError(SCMError): def __init__(self, url, rev): super().__init__( - "Failed to access revision '{}' for repo '{}'".format(rev, url) + "failed to access revision '{}' for repo '{}'".format(rev, url) ) diff --git a/dvc/scm/git/__init__.py b/dvc/scm/git/__init__.py index 2d334a6493..8efb2dc5de 100644 --- a/dvc/scm/git/__init__.py +++ b/dvc/scm/git/__init__.py @@ -363,7 +363,12 @@ def get_rev(self): return self.repo.git.rev_parse("HEAD") def resolve_rev(self, rev): - return self.repo.git.rev_parse(rev) + from git.exc import GitCommandError + + try: + return self.repo.git.rev_parse(rev) + except GitCommandError as exc: + raise RevError(url=self.root_dir, rev=rev) from exc def close(self): self.repo.close() diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index f7bb34d086..cd58870003 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -6,6 +6,7 @@ import colorama from dvc.main import main +from dvc.exceptions import DvcException def digest(text): @@ -15,7 +16,8 @@ def digest(text): def test_no_scm(tmp_dir, dvc): tmp_dir.dvc_gen("file", "text") - pytest.skip("TODO: define behavior, should it fail?") + with pytest.raises(DvcException, match=r"only supported for Git repos"): + dvc.diff() def test_added(tmp_dir, scm, dvc): @@ -86,7 +88,10 @@ def test_refs(tmp_dir, scm, dvc): ], } - pytest.skip('TODO: test dvc.diff("missing")') + with pytest.raises( + DvcException, match=r"failed to access revision 'missing'" + ): + dvc.diff("missing") def test_target(tmp_dir, scm, dvc): From 6ef90f5acf596a1b194ec1347a3c2b5d9a44a3e3 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Sun, 26 Jan 2020 20:21:22 -0600 Subject: [PATCH 14/35] diff: add option to display checksums --- dvc/command/diff.py | 44 ++++++++++++++++++++++++++++++++++------- tests/func/test_diff.py | 23 +++++++++++---------- 2 files changed, 50 insertions(+), 17 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index e68c269acf..8cb5d5c237 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -13,7 +13,7 @@ class CmdDiff(CmdBase): @staticmethod - def _format(diff): + def _format(diff, include_checksums=False): """ Given a diff structure, generate a string of filenames separated by new lines and grouped together by their state. @@ -27,8 +27,22 @@ def _format(diff): dir/ dir/1 + An example of a diff formatted when `include_checksums` is truthy: + + Added: + d3b07384 foo + + Modified: + c157a790..f98bf6f1 bar + If a group has no entries, it won't be included in the result. """ + + def _digest(checksum): + if type(checksum) is str: + return checksum[0:8] + return "{}..{}".format(checksum["old"][0:8], checksum["new"][0:8]) + colors = { "added": colorama.Fore.GREEN, "modified": colorama.Fore.YELLOW, @@ -37,16 +51,25 @@ def _format(diff): groups = [] - for key, values in diff.items(): - if not values: + for state, entries in diff.items(): + if not entries: continue - entries = sorted(" " + entry["filename"] for entry in values) + entries = sorted(tuple(entry.values()) for entry in entries) + entries = [ + "{space}{checksum}{separator}{filename}".format( + space=" ", + checksum=_digest(checksum) if include_checksums else "", + separator=" " if include_checksums else "", + filename=filename, + ) + for filename, checksum in entries + ] groups.append( "{color}{header}{nc}:\n{entries}".format( - color=colors[key], - header=key.capitalize(), + color=colors[state], + header=state.capitalize(), nc=colorama.Fore.RESET, entries="\n".join(entries), ) @@ -59,6 +82,7 @@ def run(self): diff = self.repo.diff( self.args.a_ref, self.args.b_ref, target=self.args.target ) + if not any(diff.values()): return 0 @@ -66,7 +90,7 @@ def run(self): print(json.dumps(diff)) return 0 - print(self._format(diff)) + print(self._format(diff, include_checksums=self.args.checksums)) except DvcException: logger.exception("failed to get diff") @@ -116,4 +140,10 @@ def add_parser(subparsers, parent_parser): action="store_true", default=False, ) + diff_parser.add_argument( + "--checksums", + help=("Display checksums of each entry"), + action="store_true", + default=False, + ) diff_parser.set_defaults(func=CmdDiff) diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index cd58870003..24461aec24 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -217,32 +217,35 @@ def test_cli(tmp_dir, scm, dvc, mocker, capsys): } mocker.patch("dvc.repo.Repo.diff", return_value=diff) assert 0 == main(["diff"]) - assert ( + assert capsys.readouterr().out == ( "{green}Added{nc}:\n" " file\n".format(green=colorama.Fore.GREEN, nc=colorama.Fore.RESET) - == capsys.readouterr().out ) diff = { "added": [], - "deleted": [{"filename": "file", "checksum": digest("text")}], - "modified": [ + "deleted": [ {"filename": "foo", "checksum": digest("foo")}, {"filename": "bar", "checksum": digest("bar")}, ], + "modified": [ + { + "filename": "file", + "checksum": {"old": digest("first"), "new": digest("second")}, + } + ], } mocker.patch("dvc.repo.Repo.diff", return_value=diff) - assert 0 == main(["diff"]) - assert ( + assert 0 == main(["diff", "--checksums"]) + assert capsys.readouterr().out == ( "{red}Deleted{nc}:\n" - " file\n" + " 37b51d19 bar\n" + " acbd18db foo\n" "\n" "{yellow}Modified{nc}:\n" - " bar\n" - " foo\n".format( + " 8b04d5e3..a9f0e61a file\n".format( red=colorama.Fore.RED, yellow=colorama.Fore.YELLOW, nc=colorama.Fore.RESET, ) - == capsys.readouterr().out ) From bb22acd4c96d636404283d5dab4882e65af92790 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Sun, 26 Jan 2020 22:45:35 -0600 Subject: [PATCH 15/35] diff: enable using diff when there is no cache --- tests/func/test_diff.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 24461aec24..dd2f1009bc 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -2,11 +2,13 @@ import json import os import pytest +import shutil import colorama -from dvc.main import main +from dvc.compat import fspath from dvc.exceptions import DvcException +from dvc.main import main def digest(text): @@ -23,33 +25,40 @@ def test_no_scm(tmp_dir, dvc): def test_added(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "text") - result = { + assert dvc.diff() == { "added": [{"filename": "file", "checksum": digest("text")}], "deleted": [], "modified": [], } - assert result == dvc.diff() + +def test_no_cache_entry(tmp_dir, scm, dvc): + tmp_dir.dvc_gen("file", "text") + shutil.rmtree(fspath(tmp_dir / ".dvc" / "cache")) + + assert dvc.diff() == { + "added": [{"filename": "file", "checksum": digest("text")}], + "deleted": [], + "modified": [], + } def test_deleted(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "text", commit="add file") (tmp_dir / "file.dvc").unlink() - result = { + assert dvc.diff() == { "added": [], "deleted": [{"filename": "file", "checksum": digest("text")}], "modified": [], } - assert result == dvc.diff() - def test_modified(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "first", commit="first version") tmp_dir.dvc_gen("file", "second") - result = { + assert dvc.diff() == { "added": [], "deleted": [], "modified": [ @@ -60,8 +69,6 @@ def test_modified(tmp_dir, scm, dvc): ], } - assert result == dvc.diff() - def test_refs(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "first", commit="first version") From 27ada0210dab39af0d32b094504404940256f9f5 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Sun, 26 Jan 2020 22:51:29 -0600 Subject: [PATCH 16/35] scripts: update completion scripts for `diff` --- dvc/command/diff.py | 2 +- scripts/completion/dvc.bash | 2 +- scripts/completion/dvc.zsh | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 8cb5d5c237..36e9e51980 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -142,7 +142,7 @@ def add_parser(subparsers, parent_parser): ) diff_parser.add_argument( "--checksums", - help=("Display checksums of each entry"), + help=("Display checksums for each entry"), action="store_true", default=False, ) diff --git a/scripts/completion/dvc.bash b/scripts/completion/dvc.bash index 2af965a67d..128a64326f 100644 --- a/scripts/completion/dvc.bash +++ b/scripts/completion/dvc.bash @@ -22,7 +22,7 @@ _dvc_checkout='-d --with-deps -R --recursive -f --force --relink $(compgen -G *. _dvc_commit='-f --force -d --with-deps -R --recursive $(compgen -G *.dvc)' _dvc_config='-u --unset --local --system --global' _dvc_destroy='-f --force' -_dvc_diff='-t --target' +_dvc_diff='-t --target --json --checksums' _dvc_fetch='-j --jobs -r --remote -a --all-branches -T --all-tags -d --with-deps -R --recursive $(compgen -G *.dvc)' _dvc_get_url='' _dvc_get='-o --out --rev --show-url' diff --git a/scripts/completion/dvc.zsh b/scripts/completion/dvc.zsh index 32bdef47ff..be47469d5e 100644 --- a/scripts/completion/dvc.zsh +++ b/scripts/completion/dvc.zsh @@ -98,6 +98,9 @@ _dvc_destroy=( _dvc_diff=( {-t,--target}"[Source path to a data file or directory.]:Target files:" + "--json[Format the output into a JSON]" + "--checksums[Display checksums for each entry]" + {1,2}":Git revision (e.g. branch, tag, SHA):" ) _dvc_fetch=( From e6a957a4261c36886f88bb72d7e069b3e31309af Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Sun, 26 Jan 2020 23:24:24 -0600 Subject: [PATCH 17/35] tests/diff: add directory to no_cache_entry --- tests/func/test_diff.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index dd2f1009bc..04719fd4e4 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -33,13 +33,29 @@ def test_added(tmp_dir, scm, dvc): def test_no_cache_entry(tmp_dir, scm, dvc): - tmp_dir.dvc_gen("file", "text") + tmp_dir.dvc_gen("file", "first", commit="add a file") + + tmp_dir.dvc_gen({"dir": {"1": "1", "2": "2"}}) + tmp_dir.dvc_gen("file", "second") + shutil.rmtree(fspath(tmp_dir / ".dvc" / "cache")) + (tmp_dir / ".dvc" / "state").unlink() + + dir_checksum = "5fb6b29836c388e093ca0715c872fe2a.dir" assert dvc.diff() == { - "added": [{"filename": "file", "checksum": digest("text")}], + "added": [ + {"filename": "dir/", "checksum": dir_checksum}, + {"filename": "dir/1", "checksum": digest("1")}, + {"filename": "dir/2", "checksum": digest("2")}, + ], "deleted": [], - "modified": [], + "modified": [ + { + "filename": "file", + "checksum": {"old": digest("first"), "new": digest("second")}, + } + ], } From 10c0ff22d67e11997170986b710da477796a2728 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Mon, 27 Jan 2020 01:40:24 -0600 Subject: [PATCH 18/35] diff: remove extra sorting --- dvc/command/diff.py | 2 +- tests/func/test_diff.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 36e9e51980..2b1de552c9 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -55,7 +55,7 @@ def _digest(checksum): if not entries: continue - entries = sorted(tuple(entry.values()) for entry in entries) + entries = [tuple(entry.values()) for entry in entries] entries = [ "{space}{checksum}{separator}{filename}".format( space=" ", diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 04719fd4e4..39014540c9 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -248,8 +248,8 @@ def test_cli(tmp_dir, scm, dvc, mocker, capsys): diff = { "added": [], "deleted": [ - {"filename": "foo", "checksum": digest("foo")}, {"filename": "bar", "checksum": digest("bar")}, + {"filename": "foo", "checksum": digest("foo")}, ], "modified": [ { From 9d14acd87fac7958d9c526ac76d9db481733383a Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Mon, 27 Jan 2020 11:48:02 -0600 Subject: [PATCH 19/35] :nail_care: remove excesive parenthesis --- dvc/command/diff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 2b1de552c9..bec6039d2a 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -136,13 +136,13 @@ def add_parser(subparsers, parent_parser): ) diff_parser.add_argument( "--json", - help=("Format the output into a JSON"), + help="Format the output into a JSON", action="store_true", default=False, ) diff_parser.add_argument( "--checksums", - help=("Display checksums for each entry"), + help="Display checksums for each entry", action="store_true", default=False, ) From 9171b9e0adc40b209510189567d2b71b9f923977 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Mon, 27 Jan 2020 12:07:28 -0600 Subject: [PATCH 20/35] :nail_care: add reference explaining git revisions --- tests/func/test_diff.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 39014540c9..d9f918fc15 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -156,6 +156,10 @@ def test_directories(tmp_dir, scm, dvc): scm.add("dir.dvc") scm.commit("delete a file") + + # The ":/" format is a way to specify revisions by commit message: + # https://git-scm.com/docs/git-rev-parse + # assert dvc.diff(":/init", ":/directory") == { "added": [ { From 2370cb12b3b526f15e6c35a19599629e841c1a9c Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Mon, 27 Jan 2020 12:32:27 -0600 Subject: [PATCH 21/35] diff: use logger instead of print --- dvc/command/diff.py | 7 ++-- tests/func/test_diff.py | 73 --------------------------------- tests/unit/command/test_diff.py | 59 ++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 76 deletions(-) create mode 100644 tests/unit/command/test_diff.py diff --git a/dvc/command/diff.py b/dvc/command/diff.py index bec6039d2a..12fde92648 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -87,10 +87,11 @@ def run(self): return 0 if self.args.json: - print(json.dumps(diff)) - return 0 + res = json.dumps(diff) + else: + res = self._format(diff, include_checksums=self.args.checksums) - print(self._format(diff, include_checksums=self.args.checksums)) + logger.info(res) except DvcException: logger.exception("failed to get diff") diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index d9f918fc15..e40ce3e22d 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -1,14 +1,10 @@ import hashlib -import json import os import pytest import shutil -import colorama - from dvc.compat import fspath from dvc.exceptions import DvcException -from dvc.main import main def digest(text): @@ -156,7 +152,6 @@ def test_directories(tmp_dir, scm, dvc): scm.add("dir.dvc") scm.commit("delete a file") - # The ":/" format is a way to specify revisions by commit message: # https://git-scm.com/docs/git-rev-parse # @@ -208,71 +203,3 @@ def test_directories(tmp_dir, scm, dvc): } ], } - - -def test_json(tmp_dir, scm, dvc, mocker, capsys): - assert 0 == main(["diff", "--json"]) - assert not capsys.readouterr().out - - diff = { - "added": [{"filename": "file", "checksum": digest("text")}], - "deleted": [], - "modified": [], - } - mocker.patch("dvc.repo.Repo.diff", return_value=diff) - assert 0 == main(["diff", "--json"]) - assert ( - json.dumps( - { - "added": [{"filename": "file", "checksum": digest("text")}], - "deleted": [], - "modified": [], - } - ) - in capsys.readouterr().out - ) - - -def test_cli(tmp_dir, scm, dvc, mocker, capsys): - assert 0 == main(["diff"]) - assert not capsys.readouterr().out - - diff = { - "added": [{"filename": "file", "checksum": digest("text")}], - "deleted": [], - "modified": [], - } - mocker.patch("dvc.repo.Repo.diff", return_value=diff) - assert 0 == main(["diff"]) - assert capsys.readouterr().out == ( - "{green}Added{nc}:\n" - " file\n".format(green=colorama.Fore.GREEN, nc=colorama.Fore.RESET) - ) - - diff = { - "added": [], - "deleted": [ - {"filename": "bar", "checksum": digest("bar")}, - {"filename": "foo", "checksum": digest("foo")}, - ], - "modified": [ - { - "filename": "file", - "checksum": {"old": digest("first"), "new": digest("second")}, - } - ], - } - mocker.patch("dvc.repo.Repo.diff", return_value=diff) - assert 0 == main(["diff", "--checksums"]) - assert capsys.readouterr().out == ( - "{red}Deleted{nc}:\n" - " 37b51d19 bar\n" - " acbd18db foo\n" - "\n" - "{yellow}Modified{nc}:\n" - " 8b04d5e3..a9f0e61a file\n".format( - red=colorama.Fore.RED, - yellow=colorama.Fore.YELLOW, - nc=colorama.Fore.RESET, - ) - ) diff --git a/tests/unit/command/test_diff.py b/tests/unit/command/test_diff.py new file mode 100644 index 0000000000..8454118f2e --- /dev/null +++ b/tests/unit/command/test_diff.py @@ -0,0 +1,59 @@ +import json + +from dvc.cli import parse_args + + +def test_default(mocker, caplog): + args = parse_args(["diff"]) + cmd = args.func(args) + diff = { + "added": [{"filename": "file", "checksum": "00000000"}], + "deleted": [], + "modified": [], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + + assert 0 == cmd.run() + assert "Added:\n file\n" in caplog.text + + +def test_checksums(mocker, caplog): + args = parse_args(["diff", "--checksums"]) + cmd = args.func(args) + diff = { + "added": [], + "deleted": [ + {"filename": "bar", "checksum": "00000000"}, + {"filename": "foo", "checksum": "11111111"}, + ], + "modified": [ + { + "filename": "file", + "checksum": {"old": "AAAAAAAA", "new": "BBBBBBBB"}, + } + ], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + assert 0 == cmd.run() + assert ( + "Deleted:\n" + " 00000000 bar\n" + " 11111111 foo\n" + "\n" + "Modified:\n" + " AAAAAAAA..BBBBBBBB file\n" + ) in caplog.text + + +def test_json(mocker, caplog): + args = parse_args(["diff", "--json"]) + cmd = args.func(args) + diff = { + "added": [{"filename": "file", "checksum": "00000000"}], + "deleted": [], + "modified": [], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + + assert 0 == cmd.run() + assert json.dumps(diff) in caplog.text From fced1718ccebe1056ed94fa79599c5e67982b177 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Mon, 27 Jan 2020 12:53:24 -0600 Subject: [PATCH 22/35] diff: --json -> --show-json --- dvc/command/diff.py | 4 ++-- scripts/completion/dvc.bash | 2 +- scripts/completion/dvc.zsh | 2 +- tests/unit/command/test_diff.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 12fde92648..ca316d116b 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -86,7 +86,7 @@ def run(self): if not any(diff.values()): return 0 - if self.args.json: + if self.args.show_json: res = json.dumps(diff) else: res = self._format(diff, include_checksums=self.args.checksums) @@ -136,7 +136,7 @@ def add_parser(subparsers, parent_parser): ), ) diff_parser.add_argument( - "--json", + "--show-json", help="Format the output into a JSON", action="store_true", default=False, diff --git a/scripts/completion/dvc.bash b/scripts/completion/dvc.bash index 128a64326f..b648390296 100644 --- a/scripts/completion/dvc.bash +++ b/scripts/completion/dvc.bash @@ -22,7 +22,7 @@ _dvc_checkout='-d --with-deps -R --recursive -f --force --relink $(compgen -G *. _dvc_commit='-f --force -d --with-deps -R --recursive $(compgen -G *.dvc)' _dvc_config='-u --unset --local --system --global' _dvc_destroy='-f --force' -_dvc_diff='-t --target --json --checksums' +_dvc_diff='-t --target --show-json --checksums' _dvc_fetch='-j --jobs -r --remote -a --all-branches -T --all-tags -d --with-deps -R --recursive $(compgen -G *.dvc)' _dvc_get_url='' _dvc_get='-o --out --rev --show-url' diff --git a/scripts/completion/dvc.zsh b/scripts/completion/dvc.zsh index be47469d5e..08ed4f6071 100644 --- a/scripts/completion/dvc.zsh +++ b/scripts/completion/dvc.zsh @@ -98,7 +98,7 @@ _dvc_destroy=( _dvc_diff=( {-t,--target}"[Source path to a data file or directory.]:Target files:" - "--json[Format the output into a JSON]" + "--show-json[Format the output into a JSON]" "--checksums[Display checksums for each entry]" {1,2}":Git revision (e.g. branch, tag, SHA):" ) diff --git a/tests/unit/command/test_diff.py b/tests/unit/command/test_diff.py index 8454118f2e..80ac06e4b1 100644 --- a/tests/unit/command/test_diff.py +++ b/tests/unit/command/test_diff.py @@ -46,7 +46,7 @@ def test_checksums(mocker, caplog): def test_json(mocker, caplog): - args = parse_args(["diff", "--json"]) + args = parse_args(["diff", "--show-json"]) cmd = args.func(args) diff = { "added": [{"filename": "file", "checksum": "00000000"}], From 423a6163bc61d302727e85889ae3cf3c0d45913d Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Mon, 27 Jan 2020 14:06:24 -0600 Subject: [PATCH 23/35] diff: make --checksums work with --json-show --- dvc/command/diff.py | 56 ++++++++++++++++----------------- tests/unit/command/test_diff.py | 24 ++++++++++++-- 2 files changed, 48 insertions(+), 32 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index ca316d116b..026ea13724 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -13,7 +13,7 @@ class CmdDiff(CmdBase): @staticmethod - def _format(diff, include_checksums=False): + def _format(diff): """ Given a diff structure, generate a string of filenames separated by new lines and grouped together by their state. @@ -27,7 +27,7 @@ def _format(diff, include_checksums=False): dir/ dir/1 - An example of a diff formatted when `include_checksums` is truthy: + An example of a diff formatted when entries contain checksums: Added: d3b07384 foo @@ -39,6 +39,8 @@ def _format(diff, include_checksums=False): """ def _digest(checksum): + if not checksum: + return "" if type(checksum) is str: return checksum[0:8] return "{}..{}".format(checksum["old"][0:8], checksum["new"][0:8]) @@ -49,33 +51,24 @@ def _digest(checksum): "deleted": colorama.Fore.RED, } - groups = [] - - for state, entries in diff.items(): - if not entries: - continue - - entries = [tuple(entry.values()) for entry in entries] - entries = [ - "{space}{checksum}{separator}{filename}".format( - space=" ", - checksum=_digest(checksum) if include_checksums else "", - separator=" " if include_checksums else "", - filename=filename, - ) - for filename, checksum in entries - ] - - groups.append( - "{color}{header}{nc}:\n{entries}".format( - color=colors[state], - header=state.capitalize(), - nc=colorama.Fore.RESET, - entries="\n".join(entries), - ) + return "\n\n".join( + "{color}{header}{nc}:\n{entries}".format( + color=colors[state], + header=state.capitalize(), + nc=colorama.Fore.RESET, + entries="\n".join( + "{space}{checksum}{separator}{filename}".format( + space=" ", + checksum=_digest(entry.get("checksum")), + separator=" " if entry.get("checksum") else "", + filename=entry["filename"], + ) + for entry in entries + ), ) - - return "\n\n".join(groups) + for state, entries in diff.items() + if entries + ) def run(self): try: @@ -86,10 +79,15 @@ def run(self): if not any(diff.values()): return 0 + if not self.args.checksums: + for _, entries in diff.items(): + for entry in entries: + del entry["checksum"] + if self.args.show_json: res = json.dumps(diff) else: - res = self._format(diff, include_checksums=self.args.checksums) + res = self._format(diff) logger.info(res) diff --git a/tests/unit/command/test_diff.py b/tests/unit/command/test_diff.py index 80ac06e4b1..fb29e7e76e 100644 --- a/tests/unit/command/test_diff.py +++ b/tests/unit/command/test_diff.py @@ -1,5 +1,3 @@ -import json - from dvc.cli import parse_args @@ -56,4 +54,24 @@ def test_json(mocker, caplog): mocker.patch("dvc.repo.Repo.diff", return_value=diff) assert 0 == cmd.run() - assert json.dumps(diff) in caplog.text + assert ( + '{"added": [{"filename": "file"}], "deleted": [], "modified": []}' + in caplog.text + ) + + +def test_json_checksums(mocker, caplog): + args = parse_args(["diff", "--show-json", "--checksums"]) + cmd = args.func(args) + diff = { + "added": [{"filename": "file", "checksum": "00000000"}], + "deleted": [], + "modified": [], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + + assert 0 == cmd.run() + assert ( + '{"added": [{"filename": "file", "checksum": "00000000"}], ' + '"deleted": [], "modified": []}' in caplog.text + ) From a12d81fd0a105f5aae25268b0dd1d36d1c9c345e Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Mon, 27 Jan 2020 14:13:54 -0600 Subject: [PATCH 24/35] tests/diff: fix windows compat issues --- tests/func/test_diff.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index e40ce3e22d..da1307fd29 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -41,9 +41,9 @@ def test_no_cache_entry(tmp_dir, scm, dvc): assert dvc.diff() == { "added": [ - {"filename": "dir/", "checksum": dir_checksum}, - {"filename": "dir/1", "checksum": digest("1")}, - {"filename": "dir/2", "checksum": digest("2")}, + {"filename": os.path.join("dir", ""), "checksum": dir_checksum}, + {"filename": os.path.join("dir", "1"), "checksum": digest("1")}, + {"filename": os.path.join("dir", "2"), "checksum": digest("2")}, ], "deleted": [], "modified": [ From 3c6b52bb91562f1b240b6e9aa7d7e08dd1abef7e Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Mon, 27 Jan 2020 14:26:46 -0600 Subject: [PATCH 25/35] diff: improve RevError message --- dvc/scm/base.py | 5 +---- dvc/scm/git/__init__.py | 10 +++++++--- tests/func/test_diff.py | 4 +--- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/dvc/scm/base.py b/dvc/scm/base.py index e35c7b7d7d..bbad771a01 100644 --- a/dvc/scm/base.py +++ b/dvc/scm/base.py @@ -27,10 +27,7 @@ def __init__(self, url, path): class RevError(SCMError): - def __init__(self, url, rev): - super().__init__( - "failed to access revision '{}' for repo '{}'".format(rev, url) - ) + pass class Base(object): diff --git a/dvc/scm/git/__init__.py b/dvc/scm/git/__init__.py index 8efb2dc5de..9ae9e1514a 100644 --- a/dvc/scm/git/__init__.py +++ b/dvc/scm/git/__init__.py @@ -94,7 +94,11 @@ def clone(url, to_path, rev=None): try: repo.checkout(rev) except git.exc.GitCommandError as exc: - raise RevError(url, rev) from exc + raise RevError( + "failed to access revision '{}' for repo '{}'".format( + rev, url + ) + ) from exc return repo @@ -367,8 +371,8 @@ def resolve_rev(self, rev): try: return self.repo.git.rev_parse(rev) - except GitCommandError as exc: - raise RevError(url=self.root_dir, rev=rev) from exc + except GitCommandError: + raise RevError("unknown Git revision '{}'".format(rev)) def close(self): self.repo.close() diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index da1307fd29..1ffb1c6990 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -107,9 +107,7 @@ def test_refs(tmp_dir, scm, dvc): ], } - with pytest.raises( - DvcException, match=r"failed to access revision 'missing'" - ): + with pytest.raises(DvcException, match=r"unknown Git revision 'missing'"): dvc.diff("missing") From 7f6324e2987354324cc4b2f19ffc6a88c4371929 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Mon, 27 Jan 2020 16:06:46 -0600 Subject: [PATCH 26/35] diff: remove --target --- dvc/command/diff.py | 13 +------------ dvc/repo/diff.py | 5 +---- scripts/completion/dvc.bash | 2 +- scripts/completion/dvc.zsh | 1 - tests/func/test_diff.py | 29 ----------------------------- 5 files changed, 3 insertions(+), 47 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 026ea13724..42113f9003 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -72,9 +72,7 @@ def _digest(checksum): def run(self): try: - diff = self.repo.diff( - self.args.a_ref, self.args.b_ref, target=self.args.target - ) + diff = self.repo.diff(self.args.a_ref, self.args.b_ref) if not any(diff.values()): return 0 @@ -124,15 +122,6 @@ def add_parser(subparsers, parent_parser): ), nargs="?", ) - diff_parser.add_argument( - "-t", - "--target", - help=( - "Source path to a data file or directory. Default None. " - "If not specified, compares all files and directories " - "that are under DVC control in the current working space." - ), - ) diff_parser.add_argument( "--show-json", help="Format the output into a JSON", diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 89676f24b1..1945bd1f5f 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -46,12 +46,10 @@ def _diffables_from_output(output): @locked -def diff(self, a_ref="HEAD", b_ref=None, *, target=None): +def diff(self, a_ref="HEAD", b_ref=None): """ By default, it compares the working tree with the last commit's tree. - When a `target` path is given, it only shows that file's comparison. - This implementation differs from `git diff` since DVC doesn't have the concept of `index`, but it keeps the same interface, thus, `dvc diff` would be the same as `dvc diff HEAD`. @@ -67,7 +65,6 @@ def diff(self, a_ref="HEAD", b_ref=None, *, target=None): for stage in self.stages for out in stage.outs for diffable in _diffables_from_output(out) - if not target or target == str(out) ) old = outs[a_ref] diff --git a/scripts/completion/dvc.bash b/scripts/completion/dvc.bash index b648390296..8c2d3b0612 100644 --- a/scripts/completion/dvc.bash +++ b/scripts/completion/dvc.bash @@ -22,7 +22,7 @@ _dvc_checkout='-d --with-deps -R --recursive -f --force --relink $(compgen -G *. _dvc_commit='-f --force -d --with-deps -R --recursive $(compgen -G *.dvc)' _dvc_config='-u --unset --local --system --global' _dvc_destroy='-f --force' -_dvc_diff='-t --target --show-json --checksums' +_dvc_diff='-t --show-json --checksums' _dvc_fetch='-j --jobs -r --remote -a --all-branches -T --all-tags -d --with-deps -R --recursive $(compgen -G *.dvc)' _dvc_get_url='' _dvc_get='-o --out --rev --show-url' diff --git a/scripts/completion/dvc.zsh b/scripts/completion/dvc.zsh index 08ed4f6071..03f9fa69c1 100644 --- a/scripts/completion/dvc.zsh +++ b/scripts/completion/dvc.zsh @@ -97,7 +97,6 @@ _dvc_destroy=( ) _dvc_diff=( - {-t,--target}"[Source path to a data file or directory.]:Target files:" "--show-json[Format the output into a JSON]" "--checksums[Display checksums for each entry]" {1,2}":Git revision (e.g. branch, tag, SHA):" diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 1ffb1c6990..caaa7e074e 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -111,35 +111,6 @@ def test_refs(tmp_dir, scm, dvc): dvc.diff("missing") -def test_target(tmp_dir, scm, dvc): - tmp_dir.dvc_gen("foo", "foo") - tmp_dir.dvc_gen("bar", "bar") - scm.add([".gitignore", "foo.dvc", "bar.dvc"]) - scm.commit("lowercase") - - tmp_dir.dvc_gen("foo", "FOO") - tmp_dir.dvc_gen("bar", "BAR") - scm.add(["foo.dvc", "bar.dvc"]) - scm.commit("uppercase") - - assert dvc.diff("HEAD~1", target="foo") == { - "added": [], - "deleted": [], - "modified": [ - { - "filename": "foo", - "checksum": {"old": digest("foo"), "new": digest("FOO")}, - } - ], - } - - assert dvc.diff("HEAD~1", target="missing") == { - "added": [], - "deleted": [], - "modified": [], - } - - def test_directories(tmp_dir, scm, dvc): tmp_dir.dvc_gen({"dir": {"1": "1", "2": "2"}}, commit="add a directory") tmp_dir.dvc_gen({"dir": {"3": "3"}}, commit="add a file") From f41ceb51da91351db9158fb13c7bdb719c9f6b6f Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Mon, 27 Jan 2020 17:03:50 -0600 Subject: [PATCH 27/35] tests: adjust doc --- tests/func/test_diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index caaa7e074e..446f7f246d 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -122,7 +122,7 @@ def test_directories(tmp_dir, scm, dvc): scm.commit("delete a file") # The ":/" format is a way to specify revisions by commit message: - # https://git-scm.com/docs/git-rev-parse + # https://git-scm.com/docs/revisions # assert dvc.diff(":/init", ":/directory") == { "added": [ From 76f5514a92c8139979ed467d2413be6d0da352b5 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Wed, 29 Jan 2020 13:38:03 -0600 Subject: [PATCH 28/35] refactor --- dvc/repo/diff.py | 84 +++++++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 47 deletions(-) diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 1945bd1f5f..e6735250e6 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -6,45 +6,6 @@ from dvc.scm.git import Git -Diffable = collections.namedtuple("Diffable", "filename, checksum") -Diffable.__doc__ = "Common interface to compare outputs." -Diffable.__eq__ = lambda self, other: self.filename == other.filename -Diffable.__hash__ = lambda self: hash(self.filename) - - -def _diffables_from_output(output): - """ - Transform an output into a list of Diffable objects so we can - compare them lately. - - Unpack directories to include entries' Diffables. - - To help distinguish between an a directory output and a file output, - the former one will come with a trailing slash in the filename: - - directory: "data/" - file: "data" - - You can also rely on the checksum to tell whether it was computed for - a file or a directory as a whole. - """ - if output.is_dir_checksum: - return [ - Diffable( - filename=os.path.join(str(output), ""), - checksum=output.checksum, - ) - ] + [ - Diffable( - filename=str(output.path_info / entry["relpath"]), - checksum=entry["md5"], - ) - for entry in output.dir_cache - ] - - return [Diffable(filename=str(output), checksum=output.checksum)] - - @locked def diff(self, a_ref="HEAD", b_ref=None): """ @@ -57,15 +18,44 @@ def diff(self, a_ref="HEAD", b_ref=None): if type(self.scm) is not Git: raise DvcException("only supported for Git repositories") - outs = {} - for branch in self.brancher(revs=[a_ref, b_ref]): - outs[branch] = set( - diffable - for stage in self.stages - for out in stage.outs - for diffable in _diffables_from_output(out) - ) + def _checksums_by_filenames(): + """ + A dictionary of checksums addressed by filenames collected from + the current tree outputs. + + Unpack directories to include their entries + + To help distinguish between a directory and a file output, + the former one will come with a trailing slash in the filename: + + directory: "data/" + file: "data" + """ + result = {} + + for stage in self.stages: + for output in stage.outs: + if not output.is_dir_checksum: + result.update({str(output): output.checksum}) + continue + + result.update({os.path.join(str(output), ""): output.checksum}) + + for entry in output.dir_cache: + filename = str(output.path_info / entry["relpath"]) + result.update({filename: entry["md5"]}) + + return result + + + def _compare_trees(a_ref, b_ref): + working_tree = self.tree + + a_tree = self.scm.get_tree(self.scm.resolve_rev(a_ref)) + b_tree = self.scm.get_tree(self.scm.resolve_rev(b_ref)) if b_tree else working_tree + + breakpoint() old = outs[a_ref] new = outs[b_ref or "working tree"] From 8e9d8145cd73dda1aec47e2c9e19e7b56add537c Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Wed, 29 Jan 2020 17:40:59 -0600 Subject: [PATCH 29/35] diff: remove deadcode, replace diffable with dicts --- dvc/command/diff.py | 11 +++-- dvc/repo/brancher.py | 2 - dvc/repo/diff.py | 72 +++++++++++++++------------------ dvc/scm/git/__init__.py | 62 +--------------------------- tests/func/test_diff.py | 36 ++++++++--------- tests/unit/command/test_diff.py | 16 ++++---- 6 files changed, 63 insertions(+), 136 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 42113f9003..3c6b2f447f 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -15,7 +15,7 @@ class CmdDiff(CmdBase): @staticmethod def _format(diff): """ - Given a diff structure, generate a string of filenames separated + Given a diff structure, generate a string of paths separated by new lines and grouped together by their state. A group's header is colored and its entries are sorted to enhance @@ -57,17 +57,16 @@ def _digest(checksum): header=state.capitalize(), nc=colorama.Fore.RESET, entries="\n".join( - "{space}{checksum}{separator}{filename}".format( + "{space}{checksum}{separator}{path}".format( space=" ", checksum=_digest(entry.get("checksum")), separator=" " if entry.get("checksum") else "", - filename=entry["filename"], + path=entry["path"], ) - for entry in entries + for entry in diff[state] ), ) - for state, entries in diff.items() - if entries + for state in ["added", "deleted", "modified"] ) def run(self): diff --git a/dvc/repo/brancher.py b/dvc/repo/brancher.py index 9d10901905..955b135135 100644 --- a/dvc/repo/brancher.py +++ b/dvc/repo/brancher.py @@ -41,8 +41,6 @@ def brancher( # noqa: E302 if all_tags: revs.extend(scm.list_tags()) - revs = filter(None, revs) - try: if revs: for sha, names in group_by(scm.resolve_rev, revs).items(): diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index e6735250e6..b8970c7b35 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -1,4 +1,3 @@ -import collections import os from dvc.exceptions import DvcException @@ -18,16 +17,15 @@ def diff(self, a_ref="HEAD", b_ref=None): if type(self.scm) is not Git: raise DvcException("only supported for Git repositories") - - def _checksums_by_filenames(): + def _paths_checksums(): """ - A dictionary of checksums addressed by filenames collected from + A dictionary of checksums addressed by relpaths collected from the current tree outputs. Unpack directories to include their entries To help distinguish between a directory and a file output, - the former one will come with a trailing slash in the filename: + the former one will come with a trailing slash in the path: directory: "data/" file: "data" @@ -43,42 +41,36 @@ def _checksums_by_filenames(): result.update({os.path.join(str(output), ""): output.checksum}) for entry in output.dir_cache: - filename = str(output.path_info / entry["relpath"]) - result.update({filename: entry["md5"]}) + path = str(output.path_info / entry["relpath"]) + result.update({path: entry["md5"]}) return result - - def _compare_trees(a_ref, b_ref): - working_tree = self.tree - - a_tree = self.scm.get_tree(self.scm.resolve_rev(a_ref)) - b_tree = self.scm.get_tree(self.scm.resolve_rev(b_ref)) if b_tree else working_tree - - breakpoint() - - old = outs[a_ref] - new = outs[b_ref or "working tree"] - - added = new - old - deleted = old - new - delta = old ^ new - - result = { - "added": [entry._asdict() for entry in sorted(added)], - "deleted": [entry._asdict() for entry in sorted(deleted)], - "modified": [], + working_tree = self.tree + a_tree = self.scm.get_tree(a_ref) + b_tree = self.scm.get_tree(b_ref) if b_ref else working_tree + + try: + self.tree = a_tree + old = _paths_checksums() + + self.tree = b_tree + new = _paths_checksums() + finally: + self.tree = working_tree + + # Compare paths between the old and new tree. + # set() efficiently converts dict keys to a set + added = sorted(set(new) - set(old)) + deleted = sorted(set(old) - set(new)) + modified = sorted(set(old) & set(new)) + + return { + "added": [{"path": path, "checksum": new[path]} for path in added], + "deleted": [{"path": path, "checksum": old[path]} for path in deleted], + "modified": [ + {"path": path, "checksum": {"old": old[path], "new": new[path]}} + for path in modified + if old[path] != new[path] + ], } - - for _old, _new in zip(sorted(old - delta), sorted(new - delta)): - if _old.checksum == _new.checksum: - continue - - result["modified"].append( - { - "filename": _new.filename, - "checksum": {"old": _old.checksum, "new": _new.checksum}, - } - ) - - return result diff --git a/dvc/scm/git/__init__.py b/dvc/scm/git/__init__.py index 9ae9e1514a..47f737975f 100644 --- a/dvc/scm/git/__init__.py +++ b/dvc/scm/git/__init__.py @@ -22,13 +22,6 @@ logger = logging.getLogger(__name__) -DIFF_A_TREE = "a_tree" -DIFF_B_TREE = "b_tree" -DIFF_A_REF = "a_ref" -DIFF_B_REF = "b_ref" -DIFF_EQUAL = "equal" - - class Git(Base): """Class for managing Git.""" @@ -308,60 +301,7 @@ def belongs_to_scm(self, path): return basename == self.ignore_file or Git.GIT_DIR in path_parts def get_tree(self, rev): - return GitTree(self.repo, rev) - - def _get_diff_trees(self, a_ref, b_ref): - """Private method for getting the trees and commit hashes of 2 git - references. Requires `gitdb` module (from gitpython package). - - Args: - a_ref (str): git reference - b_ref (str): second git reference. If None, uses HEAD - - Returns: - tuple: tuple with elements: (trees, commits) - """ - from gitdb.exc import BadObject, BadName - - trees = {DIFF_A_TREE: None, DIFF_B_TREE: None} - commits = [] - if b_ref is None: - b_ref = self.repo.head.commit - try: - a_commit = self.repo.git.rev_parse(a_ref, short=True) - b_commit = self.repo.git.rev_parse(b_ref, short=True) - # See https://gitpython.readthedocs.io - # /en/2.1.11/reference.html#git.objects.base.Object.__str__ - commits.append(a_commit) - commits.append(b_commit) - trees[DIFF_A_TREE] = self.get_tree(commits[0]) - trees[DIFF_B_TREE] = self.get_tree(commits[1]) - except (BadName, BadObject) as exc: - raise SCMError("git problem") from exc - return trees, commits - - def get_diff_trees(self, a_ref, b_ref=None): - """Method for getting two repo trees between two git tag commits. - Returns the dvc hash names of changed file/directory - - Args: - a_ref (str): git reference - b_ref (str): optional second git reference, default None - - Returns: - dict: dictionary with keys: {a_ref, b_ref, equal} - or {a_ref, b_ref, a_tree, b_tree} - """ - diff_dct = {DIFF_EQUAL: False} - trees, commits = self._get_diff_trees(a_ref, b_ref) - diff_dct[DIFF_A_REF] = commits[0] - diff_dct[DIFF_B_REF] = commits[1] - if commits[0] == commits[1]: - diff_dct[DIFF_EQUAL] = True - return diff_dct - diff_dct[DIFF_A_TREE] = trees[DIFF_A_TREE] - diff_dct[DIFF_B_TREE] = trees[DIFF_B_TREE] - return diff_dct + return GitTree(self.repo, self.resolve_rev(rev)) def get_rev(self): return self.repo.git.rev_parse("HEAD") diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 446f7f246d..7af8f3afca 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -22,7 +22,7 @@ def test_added(tmp_dir, scm, dvc): tmp_dir.dvc_gen("file", "text") assert dvc.diff() == { - "added": [{"filename": "file", "checksum": digest("text")}], + "added": [{"path": "file", "checksum": digest("text")}], "deleted": [], "modified": [], } @@ -41,14 +41,14 @@ def test_no_cache_entry(tmp_dir, scm, dvc): assert dvc.diff() == { "added": [ - {"filename": os.path.join("dir", ""), "checksum": dir_checksum}, - {"filename": os.path.join("dir", "1"), "checksum": digest("1")}, - {"filename": os.path.join("dir", "2"), "checksum": digest("2")}, + {"path": os.path.join("dir", ""), "checksum": dir_checksum}, + {"path": os.path.join("dir", "1"), "checksum": digest("1")}, + {"path": os.path.join("dir", "2"), "checksum": digest("2")}, ], "deleted": [], "modified": [ { - "filename": "file", + "path": "file", "checksum": {"old": digest("first"), "new": digest("second")}, } ], @@ -61,7 +61,7 @@ def test_deleted(tmp_dir, scm, dvc): assert dvc.diff() == { "added": [], - "deleted": [{"filename": "file", "checksum": digest("text")}], + "deleted": [{"path": "file", "checksum": digest("text")}], "modified": [], } @@ -75,7 +75,7 @@ def test_modified(tmp_dir, scm, dvc): "deleted": [], "modified": [ { - "filename": "file", + "path": "file", "checksum": {"old": digest("first"), "new": digest("second")}, } ], @@ -95,7 +95,7 @@ def test_refs(tmp_dir, scm, dvc): "added": [], "deleted": [], "modified": [ - {"filename": "file", "checksum": {"old": HEAD_1, "new": HEAD}} + {"path": "file", "checksum": {"old": HEAD_1, "new": HEAD}} ], } @@ -103,7 +103,7 @@ def test_refs(tmp_dir, scm, dvc): "added": [], "deleted": [], "modified": [ - {"filename": "file", "checksum": {"old": HEAD_2, "new": HEAD_1}} + {"path": "file", "checksum": {"old": HEAD_2, "new": HEAD_1}} ], } @@ -127,31 +127,29 @@ def test_directories(tmp_dir, scm, dvc): assert dvc.diff(":/init", ":/directory") == { "added": [ { - "filename": os.path.join("dir", ""), + "path": os.path.join("dir", ""), "checksum": "5fb6b29836c388e093ca0715c872fe2a.dir", }, - {"filename": os.path.join("dir", "1"), "checksum": digest("1")}, - {"filename": os.path.join("dir", "2"), "checksum": digest("2")}, + {"path": os.path.join("dir", "1"), "checksum": digest("1")}, + {"path": os.path.join("dir", "2"), "checksum": digest("2")}, ], "deleted": [], "modified": [], } assert dvc.diff(":/directory", ":/modify") == { - "added": [ - {"filename": os.path.join("dir", "3"), "checksum": digest("3")} - ], + "added": [{"path": os.path.join("dir", "3"), "checksum": digest("3")}], "deleted": [], "modified": [ { - "filename": os.path.join("dir", ""), + "path": os.path.join("dir", ""), "checksum": { "old": "5fb6b29836c388e093ca0715c872fe2a.dir", "new": "9b5faf37366b3370fd98e3e60ca439c1.dir", }, }, { - "filename": os.path.join("dir", "2"), + "path": os.path.join("dir", "2"), "checksum": {"old": digest("2"), "new": digest("two")}, }, ], @@ -160,11 +158,11 @@ def test_directories(tmp_dir, scm, dvc): assert dvc.diff(":/modify", ":/delete") == { "added": [], "deleted": [ - {"filename": os.path.join("dir", "2"), "checksum": digest("two")} + {"path": os.path.join("dir", "2"), "checksum": digest("two")} ], "modified": [ { - "filename": os.path.join("dir", ""), + "path": os.path.join("dir", ""), "checksum": { "old": "9b5faf37366b3370fd98e3e60ca439c1.dir", "new": "83ae82fb367ac9926455870773ff09e6.dir", diff --git a/tests/unit/command/test_diff.py b/tests/unit/command/test_diff.py index fb29e7e76e..1f6b450d8f 100644 --- a/tests/unit/command/test_diff.py +++ b/tests/unit/command/test_diff.py @@ -5,7 +5,7 @@ def test_default(mocker, caplog): args = parse_args(["diff"]) cmd = args.func(args) diff = { - "added": [{"filename": "file", "checksum": "00000000"}], + "added": [{"path": "file", "checksum": "00000000"}], "deleted": [], "modified": [], } @@ -21,12 +21,12 @@ def test_checksums(mocker, caplog): diff = { "added": [], "deleted": [ - {"filename": "bar", "checksum": "00000000"}, - {"filename": "foo", "checksum": "11111111"}, + {"path": "bar", "checksum": "00000000"}, + {"path": "foo", "checksum": "11111111"}, ], "modified": [ { - "filename": "file", + "path": "file", "checksum": {"old": "AAAAAAAA", "new": "BBBBBBBB"}, } ], @@ -47,7 +47,7 @@ def test_json(mocker, caplog): args = parse_args(["diff", "--show-json"]) cmd = args.func(args) diff = { - "added": [{"filename": "file", "checksum": "00000000"}], + "added": [{"path": "file", "checksum": "00000000"}], "deleted": [], "modified": [], } @@ -55,7 +55,7 @@ def test_json(mocker, caplog): assert 0 == cmd.run() assert ( - '{"added": [{"filename": "file"}], "deleted": [], "modified": []}' + '{"added": [{"path": "file"}], "deleted": [], "modified": []}' in caplog.text ) @@ -64,7 +64,7 @@ def test_json_checksums(mocker, caplog): args = parse_args(["diff", "--show-json", "--checksums"]) cmd = args.func(args) diff = { - "added": [{"filename": "file", "checksum": "00000000"}], + "added": [{"path": "file", "checksum": "00000000"}], "deleted": [], "modified": [], } @@ -72,6 +72,6 @@ def test_json_checksums(mocker, caplog): assert 0 == cmd.run() assert ( - '{"added": [{"filename": "file", "checksum": "00000000"}], ' + '{"added": [{"path": "file", "checksum": "00000000"}], ' '"deleted": [], "modified": []}' in caplog.text ) From debf6cb510ae8c9188adf015fed12af0d9567285 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Wed, 29 Jan 2020 18:00:23 -0600 Subject: [PATCH 30/35] scm: dead code --- dvc/scm/base.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dvc/scm/base.py b/dvc/scm/base.py index bbad771a01..51a52217bd 100644 --- a/dvc/scm/base.py +++ b/dvc/scm/base.py @@ -15,12 +15,6 @@ class FileNotInRepoError(SCMError): """ -class FileNotInCommitError(SCMError): - """Thrown when trying to find a file/directory that is not - in the specified commit in the repository. - """ - - class CloneError(SCMError): def __init__(self, url, path): super().__init__("Failed to clone repo '{}' to '{}'".format(url, path)) From e501549bd7b7b1e9d46b5c2cc01b6c99191e060a Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Wed, 29 Jan 2020 18:49:48 -0600 Subject: [PATCH 31/35] diff: add a summary at the bottom --- dvc/command/diff.py | 49 ++++++++++++++++++++++++--------- tests/unit/command/test_diff.py | 21 ++++++++++---- 2 files changed, 52 insertions(+), 18 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 3c6b2f447f..c8b6867851 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -36,11 +36,11 @@ def _format(diff): c157a790..f98bf6f1 bar If a group has no entries, it won't be included in the result. + + At the bottom, include a summary with the number of files per state. """ def _digest(checksum): - if not checksum: - return "" if type(checksum) is str: return checksum[0:8] return "{}..{}".format(checksum["old"][0:8], checksum["new"][0:8]) @@ -51,24 +51,47 @@ def _digest(checksum): "deleted": colorama.Fore.RED, } - return "\n\n".join( - "{color}{header}{nc}:\n{entries}".format( - color=colors[state], - header=state.capitalize(), - nc=colorama.Fore.RESET, - entries="\n".join( + summary = {} + groups = [] + + for state in ["added", "deleted", "modified"]: + summary[state] = 0 + entries = diff[state] + + if not entries: + continue + + content = [] + + for entry in entries: + path = entry["path"] + checksum = entry.get("checksum") + summary[state] += 1 if not path.endswith("/") else 0 + content.append( "{space}{checksum}{separator}{path}".format( space=" ", - checksum=_digest(entry.get("checksum")), - separator=" " if entry.get("checksum") else "", + checksum=_digest(checksum) if checksum else "", + separator=" " if checksum else "", path=entry["path"], ) - for entry in diff[state] - ), + ) + + groups.append( + "{color}{header}{nc}:\n{content}".format( + color=colors[state], + header=state.capitalize(), + nc=colorama.Fore.RESET, + content="\n".join(content), + ) ) - for state in ["added", "deleted", "modified"] + + groups.append( + "summary: added ({added}), deleted ({deleted})," + " modified ({modified})".format_map(summary) ) + return "\n\n".join(groups) + def run(self): try: diff = self.repo.diff(self.args.a_ref, self.args.b_ref) diff --git a/tests/unit/command/test_diff.py b/tests/unit/command/test_diff.py index 1f6b450d8f..fcf0209976 100644 --- a/tests/unit/command/test_diff.py +++ b/tests/unit/command/test_diff.py @@ -1,3 +1,5 @@ +import os + from dvc.cli import parse_args @@ -12,7 +14,12 @@ def test_default(mocker, caplog): mocker.patch("dvc.repo.Repo.diff", return_value=diff) assert 0 == cmd.run() - assert "Added:\n file\n" in caplog.text + assert ( + "Added:\n" + " file\n" + "\n" + "summary: added (1), deleted (0), modified (0)" + ) in caplog.text def test_checksums(mocker, caplog): @@ -21,8 +28,9 @@ def test_checksums(mocker, caplog): diff = { "added": [], "deleted": [ - {"path": "bar", "checksum": "00000000"}, - {"path": "foo", "checksum": "11111111"}, + {"path": os.path.join("data", ""), "checksum": "XXXXXXXX.dir"}, + {"path": os.path.join("data", "bar"), "checksum": "00000000"}, + {"path": os.path.join("data", "foo"), "checksum": "11111111"}, ], "modified": [ { @@ -35,11 +43,14 @@ def test_checksums(mocker, caplog): assert 0 == cmd.run() assert ( "Deleted:\n" - " 00000000 bar\n" - " 11111111 foo\n" + " XXXXXXXX " + os.path.join("data", "") + "\n" + " 00000000 " + os.path.join("data", "bar") + "\n" + " 11111111 " + os.path.join("data", "foo") + "\n" "\n" "Modified:\n" " AAAAAAAA..BBBBBBBB file\n" + "\n" + "summary: added (0), deleted (2), modified (1)" ) in caplog.text From e9bd9599a211bbdfcf2752fbb25d219ab7333214 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Thu, 30 Jan 2020 09:26:35 -0600 Subject: [PATCH 32/35] tests/diff: dont care about sorted jsons --- tests/unit/command/test_diff.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/unit/command/test_diff.py b/tests/unit/command/test_diff.py index fcf0209976..bc8a65a27e 100644 --- a/tests/unit/command/test_diff.py +++ b/tests/unit/command/test_diff.py @@ -65,10 +65,9 @@ def test_json(mocker, caplog): mocker.patch("dvc.repo.Repo.diff", return_value=diff) assert 0 == cmd.run() - assert ( - '{"added": [{"path": "file"}], "deleted": [], "modified": []}' - in caplog.text - ) + assert '"added": [{"path": "file"}]' in caplog.text + assert '"deleted": []' in caplog.text + assert '"modified": []' in caplog.text def test_json_checksums(mocker, caplog): @@ -82,7 +81,6 @@ def test_json_checksums(mocker, caplog): mocker.patch("dvc.repo.Repo.diff", return_value=diff) assert 0 == cmd.run() - assert ( - '{"added": [{"path": "file", "checksum": "00000000"}], ' - '"deleted": [], "modified": []}' in caplog.text - ) + assert '"added": [{"path": "file", "checksum": "00000000"}]' in caplog.text + assert '"deleted": []' in caplog.text + assert '"modified": []' in caplog.text From 2c2295700dc9465ab65233c8d86882bc328a0f96 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Thu, 30 Jan 2020 09:52:51 -0600 Subject: [PATCH 33/35] wording: update command help Co-authored-by: @jorgeorpinel --- dvc/command/diff.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index c8b6867851..705111ed84 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -119,28 +119,31 @@ def run(self): def add_parser(subparsers, parent_parser): DIFF_DESCRIPTION = ( - "Show diff of a data file or a directory that is under DVC control.\n" - "Some basic statistics summary, how many files were deleted/changed." + "Compare two different versions of your DVC project (tracked by Git)" + " and shows a list of paths grouped in the following categories:" + " added, modified, or deleted." ) - DIFF_HELP = "Show a diff of a DVC controlled data file or a directory." diff_parser = subparsers.add_parser( "diff", parents=[parent_parser], description=append_doc_link(DIFF_DESCRIPTION, "diff"), - help=DIFF_HELP, + help=DIFF_DESCRIPTION, formatter_class=argparse.RawDescriptionHelpFormatter, ) diff_parser.add_argument( "a_ref", - help="Git reference from which diff calculates (defaults to HEAD)", + help=( + "Git reference to the old version that you want to compare" + " (defaults to HEAD)" + ), nargs="?", default="HEAD", ) diff_parser.add_argument( "b_ref", help=( - "Git reference until which diff calculates, if omitted " - "diff shows the difference between the working tree and a_ref" + "Git reference to the new version that you want to compare." + " (defaults to the working tree)" ), nargs="?", ) From 793a069644d084c282d826a6499c50552df28b79 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Thu, 30 Jan 2020 09:59:29 -0600 Subject: [PATCH 34/35] tests/diff: order dictionary (second try) --- tests/unit/command/test_diff.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/unit/command/test_diff.py b/tests/unit/command/test_diff.py index bc8a65a27e..5d5a594616 100644 --- a/tests/unit/command/test_diff.py +++ b/tests/unit/command/test_diff.py @@ -1,3 +1,4 @@ +import collections import os from dvc.cli import parse_args @@ -73,8 +74,14 @@ def test_json(mocker, caplog): def test_json_checksums(mocker, caplog): args = parse_args(["diff", "--show-json", "--checksums"]) cmd = args.func(args) + diff = { - "added": [{"path": "file", "checksum": "00000000"}], + "added": [ + # py35: maintain a consistent key order for tests purposes + collections.OrderedDict( + [("path", "file"), ("checksum", "00000000")] + ) + ], "deleted": [], "modified": [], } From 673afe20b1e859ba09c1d98da7ec33ff5abff7a4 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Thu, 30 Jan 2020 10:28:02 -0600 Subject: [PATCH 35/35] diff: be windows friendly at file counting --- dvc/command/diff.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 705111ed84..fdf8657da7 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -1,6 +1,7 @@ import argparse import json import logging +import os import colorama @@ -66,7 +67,7 @@ def _digest(checksum): for entry in entries: path = entry["path"] checksum = entry.get("checksum") - summary[state] += 1 if not path.endswith("/") else 0 + summary[state] += 1 if not path.endswith(os.sep) else 0 content.append( "{space}{checksum}{separator}{path}".format( space=" ",