From 66b7b1f9b54348a61a5cba6004ced22f7e8eeae3 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Fri, 6 Nov 2020 00:55:43 +0200 Subject: [PATCH] dvc: use DirInfo instead of dicts/lists Currently we are converting dir_info to/from lists all the time. The reason is that dir_info is stored as list of dicts in *.dir files, but that makes it hard to work with. In addition to that, we will likely be changing .dir file format in the near future #829, so we need to abstract away dir_info into something that we won't care how it will be stored on disk. Related #3256 Related #4847 --- dvc/cache/base.py | 144 ++++------------- dvc/dir_info.py | 102 ++++++++++++ dvc/output/base.py | 6 +- dvc/tree/base.py | 42 ++--- dvc/tree/dvc.py | 16 +- tests/func/remote/test_index.py | 12 +- tests/func/test_add.py | 4 +- tests/func/test_cache.py | 7 +- tests/func/test_checkout.py | 7 +- tests/func/test_remote.py | 24 ++- tests/func/test_tree.py | 6 +- tests/unit/output/test_local.py | 8 +- tests/unit/test_dir_info.py | 273 ++++++++++++++++++++++++++++++++ tests/unit/tree/test_repo.py | 2 +- 14 files changed, 469 insertions(+), 184 deletions(-) create mode 100644 dvc/dir_info.py create mode 100644 tests/unit/test_dir_info.py diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 29ad0e9db8..35f4a1af7d 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -1,21 +1,18 @@ import json import logging from copy import copy -from operator import itemgetter from funcy import decorator from shortuuid import uuid import dvc.prompt as prompt +from dvc.dir_info import DirInfo from dvc.exceptions import ( CacheLinkError, CheckoutError, ConfirmRemoveError, DvcException, - MergeError, ) -from dvc.hash_info import HashInfo -from dvc.path_info import WindowsPathInfo from dvc.progress import Tqdm from dvc.remote.slow_link_detection import slow_link_guard @@ -77,7 +74,7 @@ def get_dir_cache(self, hash_info): try: dir_info = self.load_dir_cache(hash_info) except DirCacheError: - dir_info = [] + dir_info = DirInfo() self._dir_info[hash_info.value] = dir_info return dir_info @@ -96,18 +93,9 @@ def load_dir_cache(self, hash_info): "dir cache file format error '%s' [skipping the file]", path_info, ) - return [] - - if self.tree.PATH_CLS == WindowsPathInfo: - # only need to convert it for Windows - for info in d: - # NOTE: here is a BUG, see comment to .as_posix() below - relpath = info[self.tree.PARAM_RELPATH] - info[self.tree.PARAM_RELPATH] = relpath.replace( - "/", self.tree.PATH_CLS.sep - ) + d = [] - return d + return DirInfo.from_list(d) def changed(self, path_info, hash_info): """Checks if data has changed. @@ -271,14 +259,9 @@ def _get_dir_info_hash(self, dir_info): from dvc.path_info import PathInfo from dvc.utils import tmp_fname - # Sorting the list by path to ensure reproducibility - if isinstance(dir_info, dict): - dir_info = self._from_dict(dir_info) - dir_info = sorted(dir_info, key=itemgetter(self.tree.PARAM_RELPATH)) - tmp = tempfile.NamedTemporaryFile(delete=False).name with open(tmp, "w+") as fobj: - json.dump(dir_info, fobj, sort_keys=True) + json.dump(dir_info.to_list(), fobj, sort_keys=True) from_info = PathInfo(tmp) to_info = self.tree.path_info / tmp_fname("") @@ -286,8 +269,8 @@ def _get_dir_info_hash(self, dir_info): hash_info = self.tree.get_file_hash(to_info) hash_info.value += self.tree.CHECKSUM_DIR_SUFFIX - hash_info.dir_info = self._to_dict(dir_info) - hash_info.nfiles = len(dir_info) + hash_info.dir_info = dir_info + hash_info.nfiles = dir_info.nfiles return hash_info, to_info @@ -312,14 +295,13 @@ def save_dir_info(self, dir_info, hash_info=None): def _save_dir(self, path_info, tree, hash_info, save_link=True, **kwargs): if not hash_info.dir_info: - hash_info.dir_info = self._to_dict( - tree.cache.get_dir_cache(hash_info) - ) + hash_info.dir_info = tree.cache.get_dir_cache(hash_info) hi = self.save_dir_info(hash_info.dir_info, hash_info) - for relpath, entry_hash in Tqdm( - hi.dir_info.items(), desc="Saving " + path_info.name, unit="file" + for entry_info, entry_hash in Tqdm( + hi.dir_info.items(path_info), + desc="Saving " + path_info.name, + unit="file", ): - entry_info = path_info / relpath self._save_file( entry_info, tree, entry_hash, save_link=False, **kwargs ) @@ -404,13 +386,9 @@ def _changed_dir_cache(self, hash_info, path_info=None, filter_info=None): if self.changed_cache_file(hash_info): return True - for entry in self.get_dir_cache(hash_info): - entry_hash = HashInfo( - self.tree.PARAM_CHECKSUM, entry[self.tree.PARAM_CHECKSUM] - ) - + dir_info = self.get_dir_cache(hash_info) + for entry_info, entry_hash in dir_info.items(path_info): if path_info and filter_info: - entry_info = path_info / entry[self.tree.PARAM_RELPATH] if not entry_info.isin_or_eq(filter_info): continue @@ -488,16 +466,14 @@ def _checkout_dir( logger.debug("Linking directory '%s'.", path_info) - for entry in dir_info: - relative_path = entry[self.tree.PARAM_RELPATH] - entry_hash = entry[self.tree.PARAM_CHECKSUM] - entry_cache_info = self.tree.hash_to_path_info(entry_hash) - entry_info = path_info / relative_path + for entry_info, entry_hash_info in dir_info.items(path_info): + entry_cache_info = self.tree.hash_to_path_info( + entry_hash_info.value + ) if filter_info and not entry_info.isin_or_eq(filter_info): continue - entry_hash_info = HashInfo(self.tree.PARAM_CHECKSUM, entry_hash) if relink or self.changed(entry_info, entry_hash_info): modified = True self.safe_remove(entry_info, force=force) @@ -520,9 +496,7 @@ def _checkout_dir( def _remove_redundant_files(self, path_info, dir_info, force): existing_files = set(self.tree.walk_files(path_info)) - needed_files = { - path_info / entry[self.tree.PARAM_RELPATH] for entry in dir_info - } + needed_files = {info for info, _ in dir_info.items(path_info)} redundant_files = existing_files - needed_files for path in redundant_files: self.safe_remove(path, force) @@ -623,77 +597,19 @@ def get_files_number(self, path_info, hash_info, filter_info): return 1 if not filter_info: - return len(self.get_dir_cache(hash_info)) + return self.get_dir_cache(hash_info).nfiles return ilen( - filter_info.isin_or_eq(path_info / entry[self.tree.PARAM_CHECKSUM]) - for entry in self.get_dir_cache(hash_info) - ) - - def _to_dict(self, dir_info): - info = {} - for _entry in dir_info: - entry = _entry.copy() - relpath = entry.pop(self.tree.PARAM_RELPATH) - info[relpath] = HashInfo.from_dict(entry) - return info - - def _from_dict(self, dir_dict): - return [ - {self.tree.PARAM_RELPATH: relpath, **hash_info.to_dict()} - for relpath, hash_info in dir_dict.items() - ] - - @staticmethod - def _diff(ancestor, other, allow_removed=False): - from dictdiffer import diff - - allowed = ["add"] - if allow_removed: - allowed.append("remove") - - result = list(diff(ancestor, other)) - for typ, _, _ in result: - if typ not in allowed: - raise MergeError( - "unable to auto-merge directories with diff that contains " - f"'{typ}'ed files" - ) - return result - - def _merge_dirs(self, ancestor_info, our_info, their_info): - from dictdiffer import patch - - ancestor = self._to_dict(ancestor_info) - our = self._to_dict(our_info) - their = self._to_dict(their_info) - - our_diff = self._diff(ancestor, our) - if not our_diff: - return self._from_dict(their) - - their_diff = self._diff(ancestor, their) - if not their_diff: - return self._from_dict(our) - - # make sure there are no conflicting files - self._diff(our, their, allow_removed=True) - - merged = patch(our_diff + their_diff, ancestor, in_place=True) - - # Sorting the list by path to ensure reproducibility - return sorted( - self._from_dict(merged), key=itemgetter(self.tree.PARAM_RELPATH), + filter_info.isin_or_eq(path_info / relpath) + for relpath, _ in self.get_dir_cache(hash_info).items() ) def _get_dir_size(self, dir_info): - def _getsize(entry): - return self.tree.getsize( - self.tree.hash_to_path_info(entry[self.tree.PARAM_CHECKSUM]) - ) - try: - return sum(_getsize(entry) for entry in dir_info) + return sum( + self.tree.getsize(self.tree.hash_to_path_info(hi.value)) + for _, hi in dir_info.items() + ) except FileNotFoundError: return None @@ -704,12 +620,12 @@ def merge(self, ancestor_info, our_info, their_info): if ancestor_info: ancestor = self.get_dir_cache(ancestor_info) else: - ancestor = [] + ancestor = DirInfo() our = self.get_dir_cache(our_info) their = self.get_dir_cache(their_info) - merged = self._merge_dirs(ancestor, our, their) + merged = our.merge(ancestor, their) hash_info = self.save_dir_info(merged) hash_info.size = self._get_dir_size(merged) return hash_info @@ -728,5 +644,5 @@ def get_hash(self, tree, path_info): def set_dir_info(self, hash_info): assert hash_info.isdir - hash_info.dir_info = self._to_dict(self.get_dir_cache(hash_info)) - hash_info.nfiles = len(hash_info.dir_info) + hash_info.dir_info = self.get_dir_cache(hash_info) + hash_info.nfiles = hash_info.dir_info.nfiles diff --git a/dvc/dir_info.py b/dvc/dir_info.py new file mode 100644 index 0000000000..55fab135c2 --- /dev/null +++ b/dvc/dir_info.py @@ -0,0 +1,102 @@ +import posixpath +from operator import itemgetter + +from pygtrie import Trie + +from .hash_info import HashInfo + + +def _diff(ancestor, other, allow_removed=False): + from dictdiffer import diff + + from dvc.exceptions import MergeError + + allowed = ["add"] + if allow_removed: + allowed.append("remove") + + result = list(diff(ancestor, other)) + for typ, _, _ in result: + if typ not in allowed: + raise MergeError( + "unable to auto-merge directories with diff that contains " + f"'{typ}'ed files" + ) + return result + + +def _merge(ancestor, our, their): + import copy + + from dictdiffer import patch + + our_diff = _diff(ancestor, our) + if not our_diff: + return copy.deepcopy(their) + + their_diff = _diff(ancestor, their) + if not their_diff: + return copy.deepcopy(our) + + # make sure there are no conflicting files + _diff(our, their, allow_removed=True) + + return patch(our_diff + their_diff, ancestor) + + +class DirInfo: + PARAM_RELPATH = "relpath" + + def __init__(self): + self.trie = Trie() + + @property + def size(self): + try: + return sum( + hash_info.size + for _, hash_info in self.trie.iteritems() # noqa: B301 + ) + except TypeError: + return None + + @property + def nfiles(self): + return len(self.trie) + + def items(self, path_info=None): + for key, hash_info in self.trie.iteritems(): # noqa: B301 + path = posixpath.sep.join(key) + if path_info is not None: + path = path_info / path + yield path, hash_info + + @classmethod + def from_list(cls, lst): + ret = DirInfo() + for _entry in lst: + entry = _entry.copy() + relpath = entry.pop(cls.PARAM_RELPATH) + parts = tuple(relpath.split(posixpath.sep)) + ret.trie[parts] = HashInfo.from_dict(entry) + return ret + + def to_list(self): + # Sorting the list by path to ensure reproducibility + return sorted( + ( + { + # NOTE: not using hash_info.to_dict() because we don't want + # size/nfiles fields at this point. + hash_info.name: hash_info.value, + self.PARAM_RELPATH: posixpath.sep.join(parts), + } + for parts, hash_info in self.trie.iteritems() # noqa: B301 + ), + key=itemgetter(self.PARAM_RELPATH), + ) + + def merge(self, ancestor, their): + merged = DirInfo() + merged.trie = _merge(ancestor.trie, self.trie, their.trie) + return merged diff --git a/dvc/output/base.py b/dvc/output/base.py index b70936f576..68f6f3a8eb 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -448,9 +448,7 @@ def collect_used_dir_cache( path = str(self.path_info) filter_path = str(filter_info) if filter_info else None is_win = os.name == "nt" - for entry in self.dir_cache: - checksum = entry[self.tree.PARAM_CHECKSUM] - entry_relpath = entry[self.tree.PARAM_RELPATH] + for entry_relpath, entry_hash_info in self.dir_cache.items(): if is_win: entry_relpath = entry_relpath.replace("/", os.sep) entry_path = os.path.join(path, entry_relpath) @@ -459,7 +457,7 @@ def collect_used_dir_cache( or entry_path == filter_path or entry_path.startswith(filter_path + os.sep) ): - cache.add(self.scheme, checksum, entry_path) + cache.add(self.scheme, entry_hash_info.value, entry_path) return cache diff --git a/dvc/tree/base.py b/dvc/tree/base.py index b4d4c8556a..fb1fbc3e56 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -7,6 +7,7 @@ from funcy import cached_property, decorator +from dvc.dir_info import DirInfo from dvc.exceptions import DvcException, DvcIgnoreInCollectedDirError from dvc.ignore import DvcIgnore from dvc.path_info import URLInfo @@ -50,7 +51,6 @@ class BaseTree: PATH_CLS = URLInfo JOBS = 4 * cpu_count() - PARAM_RELPATH = "relpath" CHECKSUM_DIR_SUFFIX = ".dir" HASH_JOBS = max(1, min(4, cpu_count() // 2)) DEFAULT_VERIFY = False @@ -318,35 +318,25 @@ def _collect_dir(self, path_info, **kwargs): new_hash_infos = self._calculate_hashes(not_in_state) hash_infos.update(new_hash_infos) - sizes = [hi.size for hi in hash_infos.values()] - if None in sizes: - size = None - else: - size = sum(sizes) - - dir_info = [ - { - self.PARAM_CHECKSUM: hash_infos[fi].value, - # NOTE: this is lossy transformation: - # "hey\there" -> "hey/there" - # "hey/there" -> "hey/there" - # The latter is fine filename on Windows, which - # will transform to dir/file on back transform. - # - # Yes, this is a BUG, as long as we permit "/" in - # filenames on Windows and "\" on Unix - self.PARAM_RELPATH: fi.relative_to(path_info).as_posix(), - } - for fi in file_infos - ] - - return (dir_info, size) + dir_info = DirInfo() + for fi, hi in hash_infos.items(): + # NOTE: this is lossy transformation: + # "hey\there" -> "hey/there" + # "hey/there" -> "hey/there" + # The latter is fine filename on Windows, which + # will transform to dir/file on back transform. + # + # Yes, this is a BUG, as long as we permit "/" in + # filenames on Windows and "\" on Unix + dir_info.trie[fi.relative_to(path_info).parts] = hi + + return dir_info @use_state def get_dir_hash(self, path_info, **kwargs): - dir_info, size = self._collect_dir(path_info, **kwargs) + dir_info = self._collect_dir(path_info, **kwargs) hash_info = self.repo.cache.local.save_dir_info(dir_info) - hash_info.size = size + hash_info.size = dir_info.size return hash_info def upload(self, from_info, to_info, name=None, no_progress_bar=False): diff --git a/dvc/tree/dvc.py b/dvc/tree/dvc.py index 2290d28b1f..5d09c642a5 100644 --- a/dvc/tree/dvc.py +++ b/dvc/tree/dvc.py @@ -32,7 +32,6 @@ class DvcTree(BaseTree): # pylint:disable=abstract-method scheme = "local" PARAM_CHECKSUM = "md5" - _cached_dir_cache = None _dir_entry_hashes = {} def __init__(self, repo, fetch=False, stream=False): @@ -60,11 +59,9 @@ def _get_granular_hash( raise FileNotFoundError # NOTE: use string paths here for performance reasons - path_str = relpath(path_info, out.path_info) - if os.name == "nt": - path_str = path_str.replace(os.sep, "/") + key = tuple(relpath(path_info, out.path_info).split(os.sep)) out.get_dir_cache(remote=remote) - file_hash = out.hash_info.dir_info.get(path_str) + file_hash = out.hash_info.dir_info.trie.get(key) if file_hash: return file_hash raise FileNotFoundError @@ -166,12 +163,9 @@ def _add_dir(self, top, trie, out, **kwargs): self._fetch_dir(out, filter_info=top, **kwargs) - for entry in out.dir_cache: - entry_relpath = entry[out.tree.PARAM_RELPATH] - if os.name == "nt": - entry_relpath = entry_relpath.replace("/", os.sep) - path_info = out.path_info / entry_relpath - trie[path_info.parts] = None + base = out.path_info.parts + for key in out.dir_cache.trie.iterkeys(): # noqa: B301 + trie[base + key] = None def _walk(self, root, trie, topdown=True, **kwargs): dirs = set() diff --git a/tests/func/remote/test_index.py b/tests/func/remote/test_index.py index 64e28f412b..cc8ff00539 100644 --- a/tests/func/remote/test_index.py +++ b/tests/func/remote/test_index.py @@ -32,14 +32,16 @@ def hashes_exist(self, *args, **kwargs): def test_indexed_on_status(tmp_dir, dvc, tmp_path_factory, remote): foo = tmp_dir.dvc_gen({"foo": "foo content"})[0].outs[0] bar = tmp_dir.dvc_gen({"bar": {"baz": "baz content"}})[0].outs[0] - baz = bar.dir_cache[0] + baz_hash = bar.dir_cache.trie.get(("baz",)) dvc.push() with remote.index: remote.index.clear() dvc.status(cloud=True) with remote.index: - assert {bar.hash_info.value, baz["md5"]} == set(remote.index.hashes()) + assert {bar.hash_info.value, baz_hash.value} == set( + remote.index.hashes() + ) assert [bar.hash_info.value] == list(remote.index.dir_hashes()) assert foo.hash_info.value not in remote.index.hashes() @@ -47,11 +49,13 @@ def test_indexed_on_status(tmp_dir, dvc, tmp_path_factory, remote): def test_indexed_on_push(tmp_dir, dvc, tmp_path_factory, remote): foo = tmp_dir.dvc_gen({"foo": "foo content"})[0].outs[0] bar = tmp_dir.dvc_gen({"bar": {"baz": "baz content"}})[0].outs[0] - baz = bar.dir_cache[0] + baz_hash = bar.dir_cache.trie.get(("baz",)) dvc.push() with remote.index: - assert {bar.hash_info.value, baz["md5"]} == set(remote.index.hashes()) + assert {bar.hash_info.value, baz_hash.value} == set( + remote.index.hashes() + ) assert [bar.hash_info.value] == list(remote.index.dir_hashes()) assert foo.hash_info.value not in remote.index.hashes() diff --git a/tests/func/test_add.py b/tests/func/test_add.py index 93f63f73fb..017f02d0b1 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -80,8 +80,8 @@ def test_add_directory(tmp_dir, dvc): hash_info = stage.outs[0].hash_info dir_info = dvc.cache.local.load_dir_cache(hash_info) - for info in dir_info: - assert "\\" not in info["relpath"] + for path, _ in dir_info.trie.items(): + assert "\\" not in path class TestAddDirectoryRecursive(TestDvc): diff --git a/tests/func/test_cache.py b/tests/func/test_cache.py index 09eb047e70..b32fd2606e 100644 --- a/tests/func/test_cache.py +++ b/tests/func/test_cache.py @@ -6,6 +6,7 @@ from dvc.cache import Cache from dvc.cache.base import DirCacheError +from dvc.dir_info import DirInfo from dvc.hash_info import HashInfo from dvc.main import main from dvc.utils import relpath @@ -60,9 +61,11 @@ def test(self): self.dvc.cache.local.tree.hash_to_path_info(dir_hash) ) self.create(fname, '{"a": "b"}') - self._do_test( - self.dvc.cache.local.load_dir_cache(HashInfo("md5", dir_hash)) + dir_info = self.dvc.cache.local.load_dir_cache( + HashInfo("md5", dir_hash) ) + self.assertTrue(isinstance(dir_info, DirInfo)) + self.assertEqual(dir_info.nfiles, 0) class TestExternalCacheDir(TestDvc): diff --git a/tests/func/test_checkout.py b/tests/func/test_checkout.py index f1d962d77d..c61c684f35 100644 --- a/tests/func/test_checkout.py +++ b/tests/func/test_checkout.py @@ -100,10 +100,11 @@ def test(self): # NOTE: modifying cache file for one of the files inside the directory # to check if dvc will detect that the cache is corrupted. - entry = self.dvc.cache.local.load_dir_cache(out.hash_info)[0] - entry_hash = entry[self.dvc.cache.local.tree.PARAM_CHECKSUM] + _, entry_hash = next( + self.dvc.cache.local.load_dir_cache(out.hash_info).items() + ) cache = os.fspath( - self.dvc.cache.local.tree.hash_to_path_info(entry_hash) + self.dvc.cache.local.tree.hash_to_path_info(entry_hash.value) ) os.chmod(cache, 0o644) diff --git a/tests/func/test_remote.py b/tests/func/test_remote.py index a779b02104..b004c10fbf 100644 --- a/tests/func/test_remote.py +++ b/tests/func/test_remote.py @@ -7,6 +7,7 @@ from mock import patch from dvc.config import Config +from dvc.dir_info import DirInfo from dvc.exceptions import DownloadError, RemoteCacheRequiredError, UploadError from dvc.main import main from dvc.path_info import PathInfo @@ -149,23 +150,20 @@ def test_dir_hash_should_be_key_order_agnostic(tmp_dir, dvc): tmp_dir.gen({"data": {"1": "1 content", "2": "2 content"}}) path_info = PathInfo("data") + + dir_info = DirInfo.from_list( + [{"relpath": "1", "md5": "1"}, {"relpath": "2", "md5": "2"}] + ) with patch.object( - BaseTree, - "_collect_dir", - return_value=( - [{"relpath": "1", "md5": "1"}, {"relpath": "2", "md5": "2"}], - None, - ), + BaseTree, "_collect_dir", return_value=dir_info, ): hash1 = dvc.cache.local.tree.get_hash(path_info) + dir_info = DirInfo.from_list( + [{"md5": "1", "relpath": "1"}, {"md5": "2", "relpath": "2"}] + ) with patch.object( - BaseTree, - "_collect_dir", - return_value=( - [{"md5": "1", "relpath": "1"}, {"md5": "2", "relpath": "2"}], - None, - ), + BaseTree, "_collect_dir", return_value=dir_info, ): hash2 = dvc.cache.local.tree.get_hash(path_info) @@ -277,7 +275,7 @@ def test_push_order(tmp_dir, dvc, tmp_path_factory, mocker, local_remote): remote = dvc.cloud.get_remote("upstream") foo_path = remote.tree.hash_to_path_info(foo.hash_info.value) bar_path = remote.tree.hash_to_path_info( - foo.hash_info.dir_info["bar"].value + foo.hash_info.dir_info.trie[("bar",)].value ) paths = [args[1] for args, _ in mocked_upload.call_args_list] assert paths.index(foo_path) > paths.index(bar_path) diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index d0ff1dcd04..a17adad4ce 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -198,9 +198,9 @@ def test_repotree_walk_fetch(tmp_dir, dvc, scm, local_remote): pass assert os.path.exists(out.cache_path) - for entry in out.dir_cache: - hash_ = entry[out.tree.PARAM_CHECKSUM] - assert os.path.exists(dvc.cache.local.tree.hash_to_path_info(hash_)) + for _, hi in out.dir_cache.items(): + assert hi.name == out.tree.PARAM_CHECKSUM + assert os.path.exists(dvc.cache.local.tree.hash_to_path_info(hi.value)) def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, local_cloud): diff --git a/tests/unit/output/test_local.py b/tests/unit/output/test_local.py index 8469bf7795..3b1c4d1a4d 100644 --- a/tests/unit/output/test_local.py +++ b/tests/unit/output/test_local.py @@ -3,6 +3,7 @@ from mock import patch from dvc.cache.local import LocalCache +from dvc.dir_info import DirInfo from dvc.hash_info import HashInfo from dvc.output import LocalOutput from dvc.stage import Stage @@ -81,7 +82,12 @@ def test_return_0_on_no_cache(self): @patch.object( LocalCache, "get_dir_cache", - return_value=[{"md5": "asdf"}, {"md5": "qwe"}], + return_value=DirInfo.from_list( + [ + {"relpath": "foo", "md5": "asdf"}, + {"relpath": "bar", "md5": "qwe"}, + ] + ), ) def test_return_multiple_for_dir(self, _mock_get_dir_cache): o = self._get_output() diff --git a/tests/unit/test_dir_info.py b/tests/unit/test_dir_info.py new file mode 100644 index 0000000000..2222a5e13c --- /dev/null +++ b/tests/unit/test_dir_info.py @@ -0,0 +1,273 @@ +from operator import itemgetter + +import pytest +from pygtrie import Trie + +from dvc.dir_info import DirInfo, _merge +from dvc.hash_info import HashInfo +from dvc.path_info import PosixPathInfo, WindowsPathInfo + + +@pytest.mark.parametrize( + "lst, trie_dict", + [ + ([], {}), + ( + [ + {"md5": "def", "relpath": "zzz"}, + {"md5": "123", "relpath": "foo"}, + {"md5": "abc", "relpath": "aaa"}, + {"md5": "456", "relpath": "bar"}, + ], + { + ("zzz",): HashInfo("md5", "def"), + ("foo",): HashInfo("md5", "123"), + ("bar",): HashInfo("md5", "456"), + ("aaa",): HashInfo("md5", "abc"), + }, + ), + ( + [ + {"md5": "123", "relpath": "dir/b"}, + {"md5": "456", "relpath": "dir/z"}, + {"md5": "789", "relpath": "dir/a"}, + {"md5": "abc", "relpath": "b"}, + {"md5": "def", "relpath": "a"}, + {"md5": "ghi", "relpath": "z"}, + {"md5": "jkl", "relpath": "dir/subdir/b"}, + {"md5": "mno", "relpath": "dir/subdir/z"}, + {"md5": "pqr", "relpath": "dir/subdir/a"}, + ], + { + ("dir", "b"): HashInfo("md5", "123"), + ("dir", "z"): HashInfo("md5", "456"), + ("dir", "a"): HashInfo("md5", "789"), + ("b",): HashInfo("md5", "abc"), + ("a",): HashInfo("md5", "def"), + ("z",): HashInfo("md5", "ghi"), + ("dir", "subdir", "b"): HashInfo("md5", "jkl"), + ("dir", "subdir", "z"): HashInfo("md5", "mno"), + ("dir", "subdir", "a"): HashInfo("md5", "pqr"), + }, + ), + ], +) +def test_list(lst, trie_dict): + dir_info = DirInfo.from_list(lst) + assert dir_info.trie == Trie(trie_dict) + assert dir_info.to_list() == sorted(lst, key=itemgetter("relpath")) + + +@pytest.mark.parametrize( + "trie_dict, size", + [ + ({}, 0), + ( + { + ("a",): HashInfo("md5", "abc", size=1), + ("b",): HashInfo("md5", "def", size=2), + ("c",): HashInfo("md5", "ghi", size=3), + ("dir", "foo"): HashInfo("md5", "jkl", size=4), + ("dir", "bar"): HashInfo("md5", "mno", size=5), + ("dir", "baz"): HashInfo("md5", "pqr", size=6), + }, + 21, + ), + ( + { + ("a",): HashInfo("md5", "abc", size=1), + ("b",): HashInfo("md5", "def", size=None), + }, + None, + ), + ], +) +def test_size(trie_dict, size): + dir_info = DirInfo() + dir_info.trie = Trie(trie_dict) + assert dir_info.size == size + + +@pytest.mark.parametrize( + "trie_dict, nfiles", + [ + ({}, 0), + ( + { + ("a",): HashInfo("md5", "abc", size=1), + ("b",): HashInfo("md5", "def", size=2), + ("c",): HashInfo("md5", "ghi", size=3), + ("dir", "foo"): HashInfo("md5", "jkl", size=4), + ("dir", "bar"): HashInfo("md5", "mno", size=5), + ("dir", "baz"): HashInfo("md5", "pqr", size=6), + }, + 6, + ), + ( + { + ("a",): HashInfo("md5", "abc", size=1), + ("b",): HashInfo("md5", "def", size=None), + }, + 2, + ), + ], +) +def test_nfiles(trie_dict, nfiles): + dir_info = DirInfo() + dir_info.trie = Trie(trie_dict) + assert dir_info.nfiles == nfiles + + +@pytest.mark.parametrize( + "trie_dict, items", + [ + ({}, []), + ( + { + ("a",): HashInfo("md5", "abc"), + ("b",): HashInfo("md5", "def"), + ("c",): HashInfo("md5", "ghi"), + ("dir", "foo"): HashInfo("md5", "jkl"), + ("dir", "bar"): HashInfo("md5", "mno"), + ("dir", "baz"): HashInfo("md5", "pqr"), + ("dir", "subdir", "1"): HashInfo("md5", "stu"), + ("dir", "subdir", "2"): HashInfo("md5", "vwx"), + ("dir", "subdir", "3"): HashInfo("md5", "yz"), + }, + [ + ("a", HashInfo("md5", "abc")), + ("b", HashInfo("md5", "def")), + ("c", HashInfo("md5", "ghi")), + ("dir/foo", HashInfo("md5", "jkl")), + ("dir/bar", HashInfo("md5", "mno")), + ("dir/baz", HashInfo("md5", "pqr")), + ("dir/subdir/1", HashInfo("md5", "stu")), + ("dir/subdir/2", HashInfo("md5", "vwx")), + ("dir/subdir/3", HashInfo("md5", "yz")), + ], + ), + ], +) +def test_items(trie_dict, items): + dir_info = DirInfo() + dir_info.trie = Trie(trie_dict) + assert list(dir_info.items()) == items + + +@pytest.mark.parametrize( + "path_info, trie_dict, items", + [ + (PosixPathInfo(), {}, []), + (WindowsPathInfo(), {}, []), + ( + PosixPathInfo("/some/path"), + { + ("a",): HashInfo("md5", "abc"), + ("dir", "foo"): HashInfo("md5", "jkl"), + ("dir", "sub", "1"): HashInfo("md5", "stu"), + }, + [ + (PosixPathInfo("/some/path/a"), HashInfo("md5", "abc")), + (PosixPathInfo("/some/path/dir/foo"), HashInfo("md5", "jkl")), + ( + PosixPathInfo("/some/path/dir/sub/1"), + HashInfo("md5", "stu"), + ), + ], + ), + ( + WindowsPathInfo("C:\\some\\path"), + { + ("a",): HashInfo("md5", "abc"), + ("dir", "foo"): HashInfo("md5", "jkl"), + ("dir", "sub", "1"): HashInfo("md5", "stu"), + }, + [ + (WindowsPathInfo("C:\\some\\path\\a"), HashInfo("md5", "abc")), + ( + WindowsPathInfo("C:\\some\\path\\dir\\foo"), + HashInfo("md5", "jkl"), + ), + ( + WindowsPathInfo("C:\\some\\path\\dir\\sub\\1"), + HashInfo("md5", "stu"), + ), + ], + ), + ], +) +def test_items_with_path(path_info, trie_dict, items): + dir_info = DirInfo() + dir_info.trie = Trie(trie_dict) + assert list(dir_info.items(path_info)) == items + + +@pytest.mark.parametrize( + "ancestor_dict, our_dict, their_dict, merged_dict", + [ + ({}, {}, {}, {}), + ( + {("foo",): HashInfo("md5", "123")}, + { + ("foo",): HashInfo("md5", "123"), + ("bar",): HashInfo("md5", "345"), + }, + { + ("foo",): HashInfo("md5", "123"), + ("baz",): HashInfo("md5", "678"), + }, + { + ("foo",): HashInfo("md5", "123"), + ("bar",): HashInfo("md5", "345"), + ("baz",): HashInfo("md5", "678"), + }, + ), + ( + { + ("common",): HashInfo("md5", "123"), + ("subdir", "foo"): HashInfo("md5", "345"), + }, + { + ("common",): HashInfo("md5", "123"), + ("subdir", "foo"): HashInfo("md5", "345"), + ("subdir", "bar"): HashInfo("md5", "678"), + }, + { + ("common",): HashInfo("md5", "123"), + ("subdir", "foo"): HashInfo("md5", "345"), + ("subdir", "baz"): HashInfo("md5", "91011"), + }, + { + ("common",): HashInfo("md5", "123"), + ("subdir", "foo"): HashInfo("md5", "345"), + ("subdir", "bar"): HashInfo("md5", "678"), + ("subdir", "baz"): HashInfo("md5", "91011"), + }, + ), + ( + {}, + {("foo",): HashInfo("md5", "123")}, + {("bar",): HashInfo("md5", "456")}, + { + ("foo",): HashInfo("md5", "123"), + ("bar",): HashInfo("md5", "456"), + }, + ), + ( + {}, + {}, + {("bar",): HashInfo("md5", "123")}, + {("bar",): HashInfo("md5", "123")}, + ), + ( + {}, + {("bar",): HashInfo("md5", "123")}, + {}, + {("bar",): HashInfo("md5", "123")}, + ), + ], +) +def test_merge(ancestor_dict, our_dict, their_dict, merged_dict): + actual = _merge(Trie(ancestor_dict), Trie(our_dict), Trie(their_dict)) + expected = Trie(merged_dict) + assert actual == expected diff --git a/tests/unit/tree/test_repo.py b/tests/unit/tree/test_repo.py index 9042cf6fb7..0464186981 100644 --- a/tests/unit/tree/test_repo.py +++ b/tests/unit/tree/test_repo.py @@ -560,7 +560,7 @@ def test_get_hash_dirty_dir(tmp_dir, dvc): actual = tree.get_hash(PathInfo(tmp_dir) / "dir") expected = HashInfo("md5", "ba75a2162ca9c29acecb7957105a0bc2.dir") assert actual == expected - assert len(actual.dir_info) == 3 + assert actual.dir_info.nfiles == 3 @pytest.mark.parametrize("traverse_subrepos", [True, False])