diff --git a/dvc/command/diff.py b/dvc/command/diff.py index 7e064adda0..fdf8657da7 100644 --- a/dvc/command/diff.py +++ b/dvc/command/diff.py @@ -1,12 +1,11 @@ import argparse +import json import logging +import os -import humanize -import inflect -from funcy import compact +import colorama -from dvc.command.base import append_doc_link -from dvc.command.base import CmdBase +from dvc.command.base import CmdBase, append_doc_link from dvc.exceptions import DvcException @@ -15,156 +14,150 @@ class CmdDiff(CmdBase): @staticmethod - def _print_size(size): - if size < 0: - change = "decreased by {}" - elif size > 0: - change = "increased by {}" - else: - change = "not changed" - natur_size = humanize.naturalsize(abs(size)) - return change.format(natur_size) + def _format(diff): + """ + Given a diff structure, generate a string of paths separated + by new lines and grouped together by their state. + + A group's header is colored and its entries are sorted to enhance + readability, for example: + + Added: + another_file.txt + backup.tar + dir/ + dir/1 + + An example of a diff formatted when entries contain checksums: + + Added: + d3b07384 foo + + Modified: + c157a790..f98bf6f1 bar + + If a group has no entries, it won't be included in the result. + + At the bottom, include a summary with the number of files per state. + """ + + def _digest(checksum): + if type(checksum) is str: + return checksum[0:8] + return "{}..{}".format(checksum["old"][0:8], checksum["new"][0:8]) + + colors = { + "added": colorama.Fore.GREEN, + "modified": colorama.Fore.YELLOW, + "deleted": colorama.Fore.RED, + } + + summary = {} + groups = [] + + for state in ["added", "deleted", "modified"]: + summary[state] = 0 + entries = diff[state] + + if not entries: + continue + + content = [] + + for entry in entries: + path = entry["path"] + checksum = entry.get("checksum") + summary[state] += 1 if not path.endswith(os.sep) else 0 + content.append( + "{space}{checksum}{separator}{path}".format( + space=" ", + checksum=_digest(checksum) if checksum else "", + separator=" " if checksum else "", + path=entry["path"], + ) + ) + + groups.append( + "{color}{header}{nc}:\n{content}".format( + color=colors[state], + header=state.capitalize(), + nc=colorama.Fore.RESET, + content="\n".join(content), + ) + ) - @staticmethod - def _get_md5_string(sign, file_name, checksum): - sample_msg = "" - if file_name: - sample_msg = "{}{} with md5 {}\n" - sample_msg = sample_msg.format(sign, file_name, checksum) - return sample_msg - - @classmethod - def _get_dir_changes(cls, dct): - import dvc.repo.diff as diff - - engine = inflect.engine() - changes_msg = ( - "{} {} untouched, {} {} modified, {} {} added, " - "{} {} deleted, size was {}" + groups.append( + "summary: added ({added}), deleted ({deleted})," + " modified ({modified})".format_map(summary) ) - changes_msg = changes_msg.format( - dct[diff.DIFF_IDENT], - engine.plural("file", dct[diff.DIFF_IDENT]), - dct[diff.DIFF_CHANGE], - engine.plural("file", dct[diff.DIFF_CHANGE]), - dct[diff.DIFF_NEW], - engine.plural("file", dct[diff.DIFF_NEW]), - dct[diff.DIFF_DEL], - engine.plural("file", dct[diff.DIFF_DEL]), - cls._print_size(dct[diff.DIFF_SIZE]), - ) - return changes_msg - - @classmethod - def _get_file_changes(cls, dct): - import dvc.repo.diff as diff - - if ( - dct.get(diff.DIFF_OLD_FILE) - and dct.get(diff.DIFF_NEW_FILE) - and dct[diff.DIFF_SIZE] == 0 - ): - msg = "file size was not changed" - elif dct.get(diff.DIFF_NEW_FILE): - msg = "added file with size {}".format( - humanize.naturalsize(dct[diff.DIFF_SIZE]) - ) - elif dct.get(diff.DIFF_OLD_FILE): - msg = "deleted file with size {}".format( - humanize.naturalsize(abs(dct[diff.DIFF_SIZE])) - ) - else: - msg = "file was modified, file size {}".format( - cls._print_size(dct[diff.DIFF_SIZE]) - ) - return msg - @classmethod - def _get_royal_changes(cls, dct): - import dvc.repo.diff as diff + return "\n\n".join(groups) - if dct[diff.DIFF_SIZE] != diff.DIFF_SIZE_UNKNOWN: - if dct.get("is_dir"): - return cls._get_dir_changes(dct) - else: - return cls._get_file_changes(dct) - return "size is ?" + def run(self): + try: + diff = self.repo.diff(self.args.a_ref, self.args.b_ref) - @classmethod - def _show(cls, diff_dct): - import dvc.repo.diff as diff + if not any(diff.values()): + return 0 - msg = "dvc diff from {} to {}".format( - diff_dct[diff.DIFF_A_REF], diff_dct[diff.DIFF_B_REF] - ) - if diff_dct.get(diff.DIFF_EQUAL): - logger.info(msg) - return - for dct in diff_dct[diff.DIFF_LIST]: - msg += "\n\ndiff for '{}'\n".format(dct[diff.DIFF_TARGET]) - msg += cls._get_md5_string( - "-", - dct.get(diff.DIFF_OLD_FILE), - dct.get(diff.DIFF_OLD_CHECKSUM), - ) - msg += cls._get_md5_string( - "+", - dct.get(diff.DIFF_NEW_FILE), - dct.get(diff.DIFF_NEW_CHECKSUM), - ) - msg += "\n" - msg += cls._get_royal_changes(dct) - logger.info(msg) - return msg + if not self.args.checksums: + for _, entries in diff.items(): + for entry in entries: + del entry["checksum"] + + if self.args.show_json: + res = json.dumps(diff) + else: + res = self._format(diff) + + logger.info(res) - def run(self): - try: - msg = self.repo.diff( - self.args.a_ref, target=self.args.target, b_ref=self.args.b_ref - ) - self._show(msg) except DvcException: - msg = "failed to get 'diff {}'" - args = " ".join( - compact([self.args.target, self.args.a_ref, self.args.b_ref]) - ) - msg = msg.format(args) - logger.exception(msg) + logger.exception("failed to get diff") return 1 return 0 def add_parser(subparsers, parent_parser): DIFF_DESCRIPTION = ( - "Show diff of a data file or a directory that is under DVC control.\n" - "Some basic statistics summary, how many files were deleted/changed." + "Compare two different versions of your DVC project (tracked by Git)" + " and shows a list of paths grouped in the following categories:" + " added, modified, or deleted." ) - DIFF_HELP = "Show a diff of a DVC controlled data file or a directory." diff_parser = subparsers.add_parser( "diff", parents=[parent_parser], description=append_doc_link(DIFF_DESCRIPTION, "diff"), - help=DIFF_HELP, + help=DIFF_DESCRIPTION, formatter_class=argparse.RawDescriptionHelpFormatter, ) diff_parser.add_argument( - "-t", - "--target", + "a_ref", help=( - "Source path to a data file or directory. Default None. " - "If not specified, compares all files and directories " - "that are under DVC control in the current working space." + "Git reference to the old version that you want to compare" + " (defaults to HEAD)" ), - ) - diff_parser.add_argument( - "a_ref", help="Git reference from which diff calculates" + nargs="?", + default="HEAD", ) diff_parser.add_argument( "b_ref", help=( - "Git reference until which diff calculates, if omitted " - "diff shows the difference between current HEAD and a_ref" + "Git reference to the new version that you want to compare." + " (defaults to the working tree)" ), nargs="?", ) + diff_parser.add_argument( + "--show-json", + help="Format the output into a JSON", + action="store_true", + default=False, + ) + diff_parser.add_argument( + "--checksums", + help="Display checksums for each entry", + action="store_true", + default=False, + ) diff_parser.set_defaults(func=CmdDiff) diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 165b286b99..b8970c7b35 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -1,246 +1,76 @@ import os -from errno import ENOENT - -from dvc import logger -from dvc.scm.base import FileNotInCommitError -from dvc.scm.git import DIFF_A_REF, DIFF_B_REF, DIFF_A_TREE, DIFF_B_TREE -from dvc.scm.git import DIFF_EQUAL -from . import locked - - -DIFF_TARGET = "target" -DIFF_IS_DIR = "is_dir" -DIFF_OLD_FILE = "old_file" -DIFF_OLD_CHECKSUM = "old_checksum" -DIFF_NEW_FILE = "new_file" -DIFF_NEW_CHECKSUM = "new_checksum" -DIFF_SIZE = "size_diff" -DIFF_DEL = "del" -DIFF_IDENT = "ident" -DIFF_CHANGE = "changes" -DIFF_NEW = "new" -DIFF_MOVE = "moves" -DIFF_LIST = "diffs" -DIFF_SIZE_UNKNOWN = "?" -DIFF_A_OUTPUT = "a_output" -DIFF_B_OUTPUT = "b_output" -DIFF_DELETED = "deleted_file" -DIFF_IS_NEW = "created_file" - - -def _file_not_exists(error, result): - if error.errno == ENOENT: - result.update({DIFF_SIZE: DIFF_SIZE_UNKNOWN}) - else: - raise error - - -def _extract_dir(self, dir_not_exists, output): - """Extract the content of dvc tree file - Args: - self(object) - Repo class instance - dir_not_exists(bool) - flag for directory existence - output(object) - OutputLOCAL class instance - Returns: - dict - dictionary with keys - paths to file in .dvc/cache - values -checksums for that files - """ - if not dir_not_exists: - lst = output.dir_cache - return {i["relpath"]: i["md5"] for i in lst} - return {} - - -def _get_dir_info(dir_not_exists, a_output): - if not dir_not_exists: - return str(a_output), a_output.checksum - return "", "" - - -def _ident_files(a_entries, b_entries): - keys = [ - key for key in a_entries.keys() if b_entries.get(key) == a_entries[key] - ] - return len(keys) - - -def _modified_files(self, a_entries, b_entries): - keys = [key for key in a_entries.keys() if key in b_entries] - diff_size = 0 - modified_count = 0 - for key in keys: - if a_entries[key] != b_entries[key]: - modified_count += 1 - diff_size += os.path.getsize( - self.cache.local.get(b_entries[key]) - ) - os.path.getsize(self.cache.local.get(a_entries[key])) - return modified_count, diff_size - - -def _deleted_files(self, a_entries, b_entries): - diff_size = 0 - deleted_count = 0 - for key, value in a_entries.items(): - if key not in b_entries: - deleted_count += 1 - diff_size -= os.path.getsize(self.cache.local.get(a_entries[key])) - return deleted_count, diff_size - - -def _new_files(self, a_entries, b_entries): - diff_size = 0 - new_count = 0 - for key, value in b_entries.items(): - if key not in a_entries: - new_count += 1 - diff_size += os.path.getsize(self.cache.local.get(b_entries[key])) - return new_count, diff_size - - -def _get_tree_changes(self, a_entries, b_entries): - result = { - DIFF_DEL: 0, - DIFF_IDENT: 0, - DIFF_CHANGE: 0, - DIFF_NEW: 0, - DIFF_MOVE: 0, - } - result[DIFF_IDENT] = _ident_files(a_entries, b_entries) - result[DIFF_CHANGE], diff_size = _modified_files( - self, a_entries, b_entries - ) - result[DIFF_SIZE] = diff_size - result[DIFF_DEL], diff_size = _deleted_files(self, a_entries, b_entries) - result[DIFF_SIZE] += diff_size - result[DIFF_NEW], diff_size = _new_files(self, a_entries, b_entries) - result[DIFF_SIZE] += diff_size - return result - - -def _check_local_cache(a_out, is_checked): - if a_out is not None and a_out.scheme != "local": - is_checked.append(str(a_out)) - return True - return False - - -def _is_dir(path, a_outs, b_outs): - if a_outs.get(path): - return a_outs[path].is_dir_checksum - else: - return b_outs[path].is_dir_checksum - - -def _get_diff_outs(self, diff_dct): - self.tree = diff_dct[DIFF_A_TREE] - a_outs = {str(out): out for st in self.stages for out in st.outs} - self.tree = diff_dct[DIFF_B_TREE] - b_outs = {str(out): out for st in self.stages for out in st.outs} - outs_paths = set(a_outs.keys()) - outs_paths.update(b_outs.keys()) - results = {} - non_local_cache = [] - for path in outs_paths: - check1 = _check_local_cache(a_outs.get(path), non_local_cache) - check2 = _check_local_cache(b_outs.get(path), non_local_cache) - # skip files/directories with non-local cache for now - if check1 or check2: - continue - results[path] = {} - results[path][DIFF_A_OUTPUT] = a_outs.get(path) - results[path][DIFF_B_OUTPUT] = b_outs.get(path) - results[path][DIFF_IS_NEW] = path not in a_outs - results[path][DIFF_DELETED] = path not in b_outs - results[path][DIFF_IS_DIR] = _is_dir(path, a_outs, b_outs) - if non_local_cache: - logger.warning( - "Diff is not supported for non-local outputs. Ignoring: {}".format( - non_local_cache - ) - ) - - return results - - -def _diff_dir(self, target, diff_dct): - result = {DIFF_TARGET: target} - result[DIFF_IS_DIR] = True - a_entries, b_entries = {}, {} - try: - a_entries = _extract_dir( - self, diff_dct[DIFF_IS_NEW], diff_dct[DIFF_A_OUTPUT] - ) - b_entries = _extract_dir( - self, diff_dct[DIFF_DELETED], diff_dct[DIFF_B_OUTPUT] - ) - result[DIFF_OLD_FILE], result[DIFF_OLD_CHECKSUM] = _get_dir_info( - diff_dct[DIFF_IS_NEW], diff_dct[DIFF_A_OUTPUT] - ) - result[DIFF_NEW_FILE], result[DIFF_NEW_CHECKSUM] = _get_dir_info( - diff_dct[DIFF_DELETED], diff_dct[DIFF_B_OUTPUT] - ) - result.update(_get_tree_changes(self, a_entries, b_entries)) - except IOError as e: - _file_not_exists(e, result) - return result - - -def _diff_file(self, target, diff_dct): - result = {DIFF_TARGET: target} - size = 0 - try: - if not diff_dct[DIFF_IS_NEW]: - result[DIFF_OLD_FILE] = target - result[DIFF_OLD_CHECKSUM] = diff_dct[DIFF_A_OUTPUT].checksum - size -= os.path.getsize( - self.cache.local.get(diff_dct[DIFF_A_OUTPUT].checksum) - ) - if not diff_dct[DIFF_DELETED]: - result[DIFF_NEW_FILE] = target - result[DIFF_NEW_CHECKSUM] = diff_dct[DIFF_B_OUTPUT].checksum - size += os.path.getsize( - self.cache.local.get(diff_dct[DIFF_B_OUTPUT].checksum) - ) - result[DIFF_SIZE] = size - except IOError as e: - _file_not_exists(e, result) - return result - - -def _diff_royal(self, target, diff_dct): - if diff_dct[DIFF_IS_DIR]: - return _diff_dir(self, target, diff_dct) - return _diff_file(self, target, diff_dct) +from dvc.exceptions import DvcException +from dvc.repo import locked +from dvc.scm.git import Git -@locked -def diff(self, a_ref, target=None, b_ref=None): - """Generates diff message string output - Args: - target(str) - file/directory to check diff of - a_ref(str) - first tag - (optional) b_ref(str) - second git tag +@locked +def diff(self, a_ref="HEAD", b_ref=None): + """ + By default, it compares the working tree with the last commit's tree. - Returns: - string: string of output message with diff info + This implementation differs from `git diff` since DVC doesn't have + the concept of `index`, but it keeps the same interface, thus, + `dvc diff` would be the same as `dvc diff HEAD`. """ - result = {} - diff_dct = self.scm.get_diff_trees(a_ref, b_ref=b_ref) - result[DIFF_A_REF] = diff_dct[DIFF_A_REF] - result[DIFF_B_REF] = diff_dct[DIFF_B_REF] - if diff_dct[DIFF_EQUAL]: - result[DIFF_EQUAL] = True + if type(self.scm) is not Git: + raise DvcException("only supported for Git repositories") + + def _paths_checksums(): + """ + A dictionary of checksums addressed by relpaths collected from + the current tree outputs. + + Unpack directories to include their entries + + To help distinguish between a directory and a file output, + the former one will come with a trailing slash in the path: + + directory: "data/" + file: "data" + """ + result = {} + + for stage in self.stages: + for output in stage.outs: + if not output.is_dir_checksum: + result.update({str(output): output.checksum}) + continue + + result.update({os.path.join(str(output), ""): output.checksum}) + + for entry in output.dir_cache: + path = str(output.path_info / entry["relpath"]) + result.update({path: entry["md5"]}) + return result - result[DIFF_LIST] = [] - diff_outs = _get_diff_outs(self, diff_dct) - if target is None: - result[DIFF_LIST] = [ - _diff_royal(self, path, diff_outs[path]) for path in diff_outs - ] - elif target in diff_outs: - result[DIFF_LIST] = [_diff_royal(self, target, diff_outs[target])] - else: - msg = "Have not found file/directory '{}' in the commits" - raise FileNotInCommitError(msg.format(target)) - return result + + working_tree = self.tree + a_tree = self.scm.get_tree(a_ref) + b_tree = self.scm.get_tree(b_ref) if b_ref else working_tree + + try: + self.tree = a_tree + old = _paths_checksums() + + self.tree = b_tree + new = _paths_checksums() + finally: + self.tree = working_tree + + # Compare paths between the old and new tree. + # set() efficiently converts dict keys to a set + added = sorted(set(new) - set(old)) + deleted = sorted(set(old) - set(new)) + modified = sorted(set(old) & set(new)) + + return { + "added": [{"path": path, "checksum": new[path]} for path in added], + "deleted": [{"path": path, "checksum": old[path]} for path in deleted], + "modified": [ + {"path": path, "checksum": {"old": old[path], "new": new[path]}} + for path in modified + if old[path] != new[path] + ], + } diff --git a/dvc/scm/base.py b/dvc/scm/base.py index 412731ca89..51a52217bd 100644 --- a/dvc/scm/base.py +++ b/dvc/scm/base.py @@ -15,22 +15,13 @@ class FileNotInRepoError(SCMError): """ -class FileNotInCommitError(SCMError): - """Thrown when trying to find a file/directory that is not - in the specified commit in the repository. - """ - - class CloneError(SCMError): def __init__(self, url, path): super().__init__("Failed to clone repo '{}' to '{}'".format(url, path)) class RevError(SCMError): - def __init__(self, url, rev): - super().__init__( - "Failed to access revision '{}' for repo '{}'".format(rev, url) - ) + pass class Base(object): diff --git a/dvc/scm/git/__init__.py b/dvc/scm/git/__init__.py index 7d7f17648d..c87d6c4693 100644 --- a/dvc/scm/git/__init__.py +++ b/dvc/scm/git/__init__.py @@ -22,13 +22,6 @@ logger = logging.getLogger(__name__) -DIFF_A_TREE = "a_tree" -DIFF_B_TREE = "b_tree" -DIFF_A_REF = "a_ref" -DIFF_B_REF = "b_ref" -DIFF_EQUAL = "equal" - - class Git(Base): """Class for managing Git.""" @@ -94,7 +87,11 @@ def clone(url, to_path, rev=None): try: repo.checkout(rev) except git.exc.GitCommandError as exc: - raise RevError(url, rev) from exc + raise RevError( + "failed to access revision '{}' for repo '{}'".format( + rev, url + ) + ) from exc return repo @@ -316,66 +313,18 @@ def belongs_to_scm(self, path): return basename == self.ignore_file or Git.GIT_DIR in path_parts def get_tree(self, rev): - return GitTree(self.repo, rev) - - def _get_diff_trees(self, a_ref, b_ref): - """Private method for getting the trees and commit hashes of 2 git - references. Requires `gitdb` module (from gitpython package). - - Args: - a_ref (str): git reference - b_ref (str): second git reference. If None, uses HEAD - - Returns: - tuple: tuple with elements: (trees, commits) - """ - from gitdb.exc import BadObject, BadName - - trees = {DIFF_A_TREE: None, DIFF_B_TREE: None} - commits = [] - if b_ref is None: - b_ref = self.repo.head.commit - try: - a_commit = self.repo.git.rev_parse(a_ref, short=True) - b_commit = self.repo.git.rev_parse(b_ref, short=True) - # See https://gitpython.readthedocs.io - # /en/2.1.11/reference.html#git.objects.base.Object.__str__ - commits.append(a_commit) - commits.append(b_commit) - trees[DIFF_A_TREE] = self.get_tree(commits[0]) - trees[DIFF_B_TREE] = self.get_tree(commits[1]) - except (BadName, BadObject) as exc: - raise SCMError("git problem") from exc - return trees, commits - - def get_diff_trees(self, a_ref, b_ref=None): - """Method for getting two repo trees between two git tag commits. - Returns the dvc hash names of changed file/directory - - Args: - a_ref (str): git reference - b_ref (str): optional second git reference, default None - - Returns: - dict: dictionary with keys: {a_ref, b_ref, equal} - or {a_ref, b_ref, a_tree, b_tree} - """ - diff_dct = {DIFF_EQUAL: False} - trees, commits = self._get_diff_trees(a_ref, b_ref) - diff_dct[DIFF_A_REF] = commits[0] - diff_dct[DIFF_B_REF] = commits[1] - if commits[0] == commits[1]: - diff_dct[DIFF_EQUAL] = True - return diff_dct - diff_dct[DIFF_A_TREE] = trees[DIFF_A_TREE] - diff_dct[DIFF_B_TREE] = trees[DIFF_B_TREE] - return diff_dct + return GitTree(self.repo, self.resolve_rev(rev)) def get_rev(self): return self.repo.git.rev_parse("HEAD") def resolve_rev(self, rev): - return self.repo.git.rev_parse(rev) + from git.exc import GitCommandError + + try: + return self.repo.git.rev_parse(rev) + except GitCommandError: + raise RevError("unknown Git revision '{}'".format(rev)) def close(self): self.repo.close() diff --git a/scripts/completion/dvc.bash b/scripts/completion/dvc.bash index 2af965a67d..8c2d3b0612 100644 --- a/scripts/completion/dvc.bash +++ b/scripts/completion/dvc.bash @@ -22,7 +22,7 @@ _dvc_checkout='-d --with-deps -R --recursive -f --force --relink $(compgen -G *. _dvc_commit='-f --force -d --with-deps -R --recursive $(compgen -G *.dvc)' _dvc_config='-u --unset --local --system --global' _dvc_destroy='-f --force' -_dvc_diff='-t --target' +_dvc_diff='-t --show-json --checksums' _dvc_fetch='-j --jobs -r --remote -a --all-branches -T --all-tags -d --with-deps -R --recursive $(compgen -G *.dvc)' _dvc_get_url='' _dvc_get='-o --out --rev --show-url' diff --git a/scripts/completion/dvc.zsh b/scripts/completion/dvc.zsh index 32bdef47ff..03f9fa69c1 100644 --- a/scripts/completion/dvc.zsh +++ b/scripts/completion/dvc.zsh @@ -97,7 +97,9 @@ _dvc_destroy=( ) _dvc_diff=( - {-t,--target}"[Source path to a data file or directory.]:Target files:" + "--show-json[Format the output into a JSON]" + "--checksums[Display checksums for each entry]" + {1,2}":Git revision (e.g. branch, tag, SHA):" ) _dvc_fetch=( diff --git a/tests/func/test_diff.py b/tests/func/test_diff.py index 15d2cc982f..7af8f3afca 100644 --- a/tests/func/test_diff.py +++ b/tests/func/test_diff.py @@ -1,267 +1,172 @@ +import hashlib import os +import pytest +import shutil -import dvc.repo.diff as diff -from dvc.main import main -from dvc.scm.base import FileNotInCommitError -from tests.basic_env import TestDvcGit - - -class TestDiff(TestDvcGit): - def setUp(self): - super().setUp() - - self.new_file = "new_test_file" - self.create(self.new_file, self.new_file) - stage = self.dvc.add(self.new_file)[0] - self.a_ref = self.git.git.rev_parse(self.git.head.commit, short=True) - self.new_checksum = stage.outs[0].checksum - self.git.index.add([self.new_file + ".dvc"]) - self.git.index.commit("adds new_file") - self.test_dct = { - diff.DIFF_A_REF: self.a_ref, - diff.DIFF_B_REF: self.git.git.rev_parse( - self.git.head.commit, short=True - ), - diff.DIFF_LIST: [ - { - diff.DIFF_TARGET: self.new_file, - diff.DIFF_NEW_FILE: self.new_file, - diff.DIFF_NEW_CHECKSUM: self.new_checksum, - diff.DIFF_SIZE: 13, - } - ], - } - - def test(self): - out = self.dvc.scm.get_diff_trees(self.a_ref) - self.assertFalse(out[diff.DIFF_EQUAL]) - self.assertEqual(self.a_ref, out[diff.DIFF_A_REF]) - self.assertEqual( - self.git.git.rev_parse(self.git.head.commit, short=True), - out[diff.DIFF_B_REF], - ) - - -class TestDiffRepo(TestDiff): - def test(self): - result = self.dvc.diff(self.a_ref, target=self.new_file) - self.assertEqual(self.test_dct, result) - - -class TestDiffCmdLine(TestDiff): - def test(self): - ret = main(["diff", "-t", self.new_file, self.a_ref]) - self.assertEqual(ret, 0) - - -class TestDiffDir(TestDvcGit): - def setUp(self): - super().setUp() - - stage = self.dvc.add(self.DATA_DIR)[0] - self.git.index.add([self.DATA_DIR + ".dvc"]) - self.git.index.commit("adds data_dir") - self.a_ref = self.git.git.rev_parse( - self.dvc.scm.repo.head.commit, short=True - ) - self.old_checksum = stage.outs[0].checksum - self.new_file = os.path.join(self.DATA_SUB_DIR, diff.DIFF_NEW_FILE) - self.create(self.new_file, self.new_file) - stage = self.dvc.add(self.DATA_DIR)[0] - self.git.index.add([self.DATA_DIR + ".dvc"]) - self.git.index.commit(message="adds data_dir with new_file") - self.new_checksum = stage.outs[0].checksum - - def test(self): - out = self.dvc.scm.get_diff_trees(self.a_ref) - self.assertFalse(out[diff.DIFF_EQUAL]) - self.assertEqual(self.a_ref, out[diff.DIFF_A_REF]) - self.assertEqual( - self.git.git.rev_parse(self.git.head.commit, short=True), - out[diff.DIFF_B_REF], - ) - - -class TestDiffDirRepo(TestDiffDir): - maxDiff = None - - def test(self): - result = self.dvc.diff(self.a_ref, target=self.DATA_DIR) - test_dct = { - diff.DIFF_A_REF: self.git.git.rev_parse(self.a_ref, short=True), - diff.DIFF_B_REF: self.git.git.rev_parse( - self.git.head.commit, short=True - ), - diff.DIFF_LIST: [ - { - diff.DIFF_CHANGE: 0, - diff.DIFF_DEL: 0, - diff.DIFF_IDENT: 2, - diff.DIFF_MOVE: 0, - diff.DIFF_NEW: 1, - diff.DIFF_IS_DIR: True, - diff.DIFF_TARGET: self.DATA_DIR, - diff.DIFF_NEW_FILE: self.DATA_DIR, - diff.DIFF_NEW_CHECKSUM: self.new_checksum, - diff.DIFF_OLD_FILE: self.DATA_DIR, - diff.DIFF_OLD_CHECKSUM: self.old_checksum, - diff.DIFF_SIZE: 30, - } - ], - } - self.assertEqual(test_dct, result) - - -class TestDiffDirRepoDeletedFile(TestDiffDir): - maxDiff = None - - def setUp(self): - super().setUp() - - self.b_ref = self.a_ref - tmp = self.new_checksum - self.new_checksum = self.old_checksum - self.a_ref = str(self.dvc.scm.repo.head.commit) - self.old_checksum = tmp - - def test(self): - result = self.dvc.diff( - self.a_ref, b_ref=self.b_ref, target=self.DATA_DIR - ) - test_dct = { - diff.DIFF_A_REF: self.git.git.rev_parse(self.a_ref, short=True), - diff.DIFF_B_REF: self.git.git.rev_parse(self.b_ref, short=True), - diff.DIFF_LIST: [ - { - diff.DIFF_CHANGE: 0, - diff.DIFF_DEL: 1, - diff.DIFF_IDENT: 2, - diff.DIFF_MOVE: 0, - diff.DIFF_NEW: 0, - diff.DIFF_IS_DIR: True, - diff.DIFF_TARGET: self.DATA_DIR, - diff.DIFF_NEW_FILE: self.DATA_DIR, - diff.DIFF_NEW_CHECKSUM: self.new_checksum, - diff.DIFF_OLD_FILE: self.DATA_DIR, - diff.DIFF_OLD_CHECKSUM: self.old_checksum, - diff.DIFF_SIZE: -30, - } - ], - } - self.assertEqual(test_dct, result) - - -class TestDiffFileNotFound(TestDiffDir): - def setUp(self): - super().setUp() - self.unknown_file = "unknown_file_" + str(id(self)) - - def test(self): - with self.assertRaises(FileNotInCommitError): - self.dvc.diff(self.a_ref, target=self.unknown_file) - - -class TestDiffModifiedFile(TestDiff): - maxDiff = None - - def setUp(self): - super().setUp() - - self.old_checksum = self.new_checksum - self.new_file_content = "new_test_file_bigger_content_123456789" - self.diff_len = len(self.new_file) + len(self.new_file_content) - self.create(self.new_file, self.new_file_content) - stage = self.dvc.add(self.new_file)[0] - self.git.index.add([self.new_file + ".dvc"]) - self.git.index.commit("change new_file content to be bigger") - self.new_checksum = stage.outs[0].checksum - self.b_ref = self.git.git.rev_parse(self.git.head.commit, short=True) - - def test(self): - result = self.dvc.diff( - self.a_ref, b_ref=self.b_ref, target=self.new_file - ) - test_dct = { - diff.DIFF_A_REF: self.git.git.rev_parse(self.a_ref, short=True), - diff.DIFF_B_REF: self.git.git.rev_parse(self.b_ref, short=True), - diff.DIFF_LIST: [ - { - diff.DIFF_NEW_CHECKSUM: self.new_checksum, - diff.DIFF_NEW_FILE: self.new_file, - diff.DIFF_TARGET: self.new_file, - diff.DIFF_SIZE: self.diff_len, - } - ], - } - self.assertEqual(test_dct, result) - - -class TestDiffDirWithFile(TestDiffDir): - maxDiff = None - - def setUp(self): - super().setUp() - - self.a_ref = self.git.git.rev_parse(self.git.head.commit, short=True) - self.old_checksum = self.new_checksum - self.new_file_content = "new_test_file_bigger_content_123456789" - self.diff_len = len(self.new_file_content) - self.create(self.new_file, self.new_file_content) - stage = self.dvc.add(self.DATA_DIR)[0] - self.git.index.add([self.DATA_DIR + ".dvc"]) - self.git.index.commit(message="modify file in the data dir") - self.new_checksum = stage.outs[0].checksum - self.b_ref = self.git.git.rev_parse(self.git.head.commit, short=True) - - def test(self): - result = self.dvc.diff(self.a_ref, target=self.DATA_DIR) - test_dct = { - diff.DIFF_A_REF: self.git.git.rev_parse(self.a_ref, short=True), - diff.DIFF_B_REF: self.git.git.rev_parse(self.b_ref, short=True), - diff.DIFF_LIST: [ - { - diff.DIFF_IDENT: 2, - diff.DIFF_CHANGE: 1, - diff.DIFF_DEL: 0, - diff.DIFF_MOVE: 0, - diff.DIFF_NEW: 0, - diff.DIFF_IS_DIR: True, - diff.DIFF_TARGET: self.DATA_DIR, - diff.DIFF_NEW_FILE: self.DATA_DIR, - diff.DIFF_NEW_CHECKSUM: self.new_checksum, - diff.DIFF_OLD_FILE: self.DATA_DIR, - diff.DIFF_OLD_CHECKSUM: self.old_checksum, - diff.DIFF_SIZE: self.diff_len, - } - ], - } - self.assertEqual(test_dct, result) - - -class TestDiffCmdMessage(TestDiff): - maxDiff = None - - def test(self): - ret = main( - [ - "diff", - self.test_dct[diff.DIFF_A_REF], - self.test_dct[diff.DIFF_B_REF], - ] - ) - self.assertEqual(ret, 0) - - msg1 = "dvc diff from {0} to {1}".format( - self.git.git.rev_parse(self.test_dct[diff.DIFF_A_REF], short=True), - self.git.git.rev_parse(self.test_dct[diff.DIFF_B_REF], short=True), - ) - msg2 = "diff for '{0}'".format( - self.test_dct[diff.DIFF_LIST][0][diff.DIFF_TARGET] - ) - msg3 = "+{0} with md5 {1}".format( - self.test_dct[diff.DIFF_LIST][0][diff.DIFF_TARGET], - self.test_dct[diff.DIFF_LIST][0][diff.DIFF_NEW_CHECKSUM], - ) - msg4 = "added file with size 13 Bytes" - for m in [msg1, msg2, msg3, msg4]: - assert m in self._caplog.text +from dvc.compat import fspath +from dvc.exceptions import DvcException + + +def digest(text): + return hashlib.md5(bytes(text, "utf-8")).hexdigest() + + +def test_no_scm(tmp_dir, dvc): + tmp_dir.dvc_gen("file", "text") + + with pytest.raises(DvcException, match=r"only supported for Git repos"): + dvc.diff() + + +def test_added(tmp_dir, scm, dvc): + tmp_dir.dvc_gen("file", "text") + + assert dvc.diff() == { + "added": [{"path": "file", "checksum": digest("text")}], + "deleted": [], + "modified": [], + } + + +def test_no_cache_entry(tmp_dir, scm, dvc): + tmp_dir.dvc_gen("file", "first", commit="add a file") + + tmp_dir.dvc_gen({"dir": {"1": "1", "2": "2"}}) + tmp_dir.dvc_gen("file", "second") + + shutil.rmtree(fspath(tmp_dir / ".dvc" / "cache")) + (tmp_dir / ".dvc" / "state").unlink() + + dir_checksum = "5fb6b29836c388e093ca0715c872fe2a.dir" + + assert dvc.diff() == { + "added": [ + {"path": os.path.join("dir", ""), "checksum": dir_checksum}, + {"path": os.path.join("dir", "1"), "checksum": digest("1")}, + {"path": os.path.join("dir", "2"), "checksum": digest("2")}, + ], + "deleted": [], + "modified": [ + { + "path": "file", + "checksum": {"old": digest("first"), "new": digest("second")}, + } + ], + } + + +def test_deleted(tmp_dir, scm, dvc): + tmp_dir.dvc_gen("file", "text", commit="add file") + (tmp_dir / "file.dvc").unlink() + + assert dvc.diff() == { + "added": [], + "deleted": [{"path": "file", "checksum": digest("text")}], + "modified": [], + } + + +def test_modified(tmp_dir, scm, dvc): + tmp_dir.dvc_gen("file", "first", commit="first version") + tmp_dir.dvc_gen("file", "second") + + assert dvc.diff() == { + "added": [], + "deleted": [], + "modified": [ + { + "path": "file", + "checksum": {"old": digest("first"), "new": digest("second")}, + } + ], + } + + +def test_refs(tmp_dir, scm, dvc): + tmp_dir.dvc_gen("file", "first", commit="first version") + tmp_dir.dvc_gen("file", "second", commit="second version") + tmp_dir.dvc_gen("file", "third", commit="third version") + + HEAD_2 = digest("first") + HEAD_1 = digest("second") + HEAD = digest("third") + + assert dvc.diff("HEAD~1") == { + "added": [], + "deleted": [], + "modified": [ + {"path": "file", "checksum": {"old": HEAD_1, "new": HEAD}} + ], + } + + assert dvc.diff("HEAD~2", "HEAD~1") == { + "added": [], + "deleted": [], + "modified": [ + {"path": "file", "checksum": {"old": HEAD_2, "new": HEAD_1}} + ], + } + + with pytest.raises(DvcException, match=r"unknown Git revision 'missing'"): + dvc.diff("missing") + + +def test_directories(tmp_dir, scm, dvc): + tmp_dir.dvc_gen({"dir": {"1": "1", "2": "2"}}, commit="add a directory") + tmp_dir.dvc_gen({"dir": {"3": "3"}}, commit="add a file") + tmp_dir.dvc_gen({"dir": {"2": "two"}}, commit="modify a file") + + (tmp_dir / "dir" / "2").unlink() + dvc.add("dir") + scm.add("dir.dvc") + scm.commit("delete a file") + + # The ":/" format is a way to specify revisions by commit message: + # https://git-scm.com/docs/revisions + # + assert dvc.diff(":/init", ":/directory") == { + "added": [ + { + "path": os.path.join("dir", ""), + "checksum": "5fb6b29836c388e093ca0715c872fe2a.dir", + }, + {"path": os.path.join("dir", "1"), "checksum": digest("1")}, + {"path": os.path.join("dir", "2"), "checksum": digest("2")}, + ], + "deleted": [], + "modified": [], + } + + assert dvc.diff(":/directory", ":/modify") == { + "added": [{"path": os.path.join("dir", "3"), "checksum": digest("3")}], + "deleted": [], + "modified": [ + { + "path": os.path.join("dir", ""), + "checksum": { + "old": "5fb6b29836c388e093ca0715c872fe2a.dir", + "new": "9b5faf37366b3370fd98e3e60ca439c1.dir", + }, + }, + { + "path": os.path.join("dir", "2"), + "checksum": {"old": digest("2"), "new": digest("two")}, + }, + ], + } + + assert dvc.diff(":/modify", ":/delete") == { + "added": [], + "deleted": [ + {"path": os.path.join("dir", "2"), "checksum": digest("two")} + ], + "modified": [ + { + "path": os.path.join("dir", ""), + "checksum": { + "old": "9b5faf37366b3370fd98e3e60ca439c1.dir", + "new": "83ae82fb367ac9926455870773ff09e6.dir", + }, + } + ], + } diff --git a/tests/unit/command/test_diff.py b/tests/unit/command/test_diff.py new file mode 100644 index 0000000000..5d5a594616 --- /dev/null +++ b/tests/unit/command/test_diff.py @@ -0,0 +1,93 @@ +import collections +import os + +from dvc.cli import parse_args + + +def test_default(mocker, caplog): + args = parse_args(["diff"]) + cmd = args.func(args) + diff = { + "added": [{"path": "file", "checksum": "00000000"}], + "deleted": [], + "modified": [], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + + assert 0 == cmd.run() + assert ( + "Added:\n" + " file\n" + "\n" + "summary: added (1), deleted (0), modified (0)" + ) in caplog.text + + +def test_checksums(mocker, caplog): + args = parse_args(["diff", "--checksums"]) + cmd = args.func(args) + diff = { + "added": [], + "deleted": [ + {"path": os.path.join("data", ""), "checksum": "XXXXXXXX.dir"}, + {"path": os.path.join("data", "bar"), "checksum": "00000000"}, + {"path": os.path.join("data", "foo"), "checksum": "11111111"}, + ], + "modified": [ + { + "path": "file", + "checksum": {"old": "AAAAAAAA", "new": "BBBBBBBB"}, + } + ], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + assert 0 == cmd.run() + assert ( + "Deleted:\n" + " XXXXXXXX " + os.path.join("data", "") + "\n" + " 00000000 " + os.path.join("data", "bar") + "\n" + " 11111111 " + os.path.join("data", "foo") + "\n" + "\n" + "Modified:\n" + " AAAAAAAA..BBBBBBBB file\n" + "\n" + "summary: added (0), deleted (2), modified (1)" + ) in caplog.text + + +def test_json(mocker, caplog): + args = parse_args(["diff", "--show-json"]) + cmd = args.func(args) + diff = { + "added": [{"path": "file", "checksum": "00000000"}], + "deleted": [], + "modified": [], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + + assert 0 == cmd.run() + assert '"added": [{"path": "file"}]' in caplog.text + assert '"deleted": []' in caplog.text + assert '"modified": []' in caplog.text + + +def test_json_checksums(mocker, caplog): + args = parse_args(["diff", "--show-json", "--checksums"]) + cmd = args.func(args) + + diff = { + "added": [ + # py35: maintain a consistent key order for tests purposes + collections.OrderedDict( + [("path", "file"), ("checksum", "00000000")] + ) + ], + "deleted": [], + "modified": [], + } + mocker.patch("dvc.repo.Repo.diff", return_value=diff) + + assert 0 == cmd.run() + assert '"added": [{"path": "file", "checksum": "00000000"}]' in caplog.text + assert '"deleted": []' in caplog.text + assert '"modified": []' in caplog.text