dvc: use DirInfo instead of dicts/lists #4851

Merged · 1 commit · Nov 7, 2020
144 changes: 30 additions & 114 deletions dvc/cache/base.py
@@ -1,21 +1,18 @@
import json
import logging
from copy import copy
from operator import itemgetter

from funcy import decorator
from shortuuid import uuid

import dvc.prompt as prompt
from dvc.dir_info import DirInfo
from dvc.exceptions import (
CacheLinkError,
CheckoutError,
ConfirmRemoveError,
DvcException,
MergeError,
)
from dvc.hash_info import HashInfo
from dvc.path_info import WindowsPathInfo
from dvc.progress import Tqdm
from dvc.remote.slow_link_detection import slow_link_guard

@@ -77,7 +74,7 @@ def get_dir_cache(self, hash_info):
try:
dir_info = self.load_dir_cache(hash_info)
except DirCacheError:
dir_info = []
dir_info = DirInfo()

self._dir_info[hash_info.value] = dir_info
return dir_info
@@ -96,18 +93,9 @@ def load_dir_cache(self, hash_info):
"dir cache file format error '%s' [skipping the file]",
path_info,
)
return []

if self.tree.PATH_CLS == WindowsPathInfo:
# only need to convert it for Windows
for info in d:
# NOTE: here is a BUG, see comment to .as_posix() below
relpath = info[self.tree.PARAM_RELPATH]
info[self.tree.PARAM_RELPATH] = relpath.replace(
"/", self.tree.PATH_CLS.sep
)
d = []

return d
return DirInfo.from_list(d)

def changed(self, path_info, hash_info):
"""Checks if data has changed.
@@ -271,23 +259,18 @@ def _get_dir_info_hash(self, dir_info):
from dvc.path_info import PathInfo
from dvc.utils import tmp_fname

# Sorting the list by path to ensure reproducibility
if isinstance(dir_info, dict):
dir_info = self._from_dict(dir_info)
dir_info = sorted(dir_info, key=itemgetter(self.tree.PARAM_RELPATH))

tmp = tempfile.NamedTemporaryFile(delete=False).name
with open(tmp, "w+") as fobj:
json.dump(dir_info, fobj, sort_keys=True)
json.dump(dir_info.to_list(), fobj, sort_keys=True)

from_info = PathInfo(tmp)
to_info = self.tree.path_info / tmp_fname("")
self.tree.upload(from_info, to_info, no_progress_bar=True)

hash_info = self.tree.get_file_hash(to_info)
hash_info.value += self.tree.CHECKSUM_DIR_SUFFIX
hash_info.dir_info = self._to_dict(dir_info)
hash_info.nfiles = len(dir_info)
hash_info.dir_info = dir_info
hash_info.nfiles = dir_info.nfiles

return hash_info, to_info
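
For reference, the hunk above serializes dir_info.to_list() to JSON and hashes that file, appending the directory suffix to the resulting value. A rough local approximation of that computation (not DVC's actual tree/upload path; the helper name approx_dir_hash, the md5 algorithm, and the ".dir" suffix value are assumptions for illustration):

import hashlib
import json

def approx_dir_hash(dir_info):
    # Serialize the relpath-sorted entry list the same way _get_dir_info_hash
    # dumps it, then hash the JSON text and append the directory suffix.
    payload = json.dumps(dir_info.to_list(), sort_keys=True)
    return hashlib.md5(payload.encode("utf-8")).hexdigest() + ".dir"
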

@@ -312,14 +295,13 @@ def save_dir_info(self, dir_info, hash_info=None):

def _save_dir(self, path_info, tree, hash_info, save_link=True, **kwargs):
if not hash_info.dir_info:
hash_info.dir_info = self._to_dict(
tree.cache.get_dir_cache(hash_info)
)
hash_info.dir_info = tree.cache.get_dir_cache(hash_info)
hi = self.save_dir_info(hash_info.dir_info, hash_info)
for relpath, entry_hash in Tqdm(
hi.dir_info.items(), desc="Saving " + path_info.name, unit="file"
for entry_info, entry_hash in Tqdm(
hi.dir_info.items(path_info),
desc="Saving " + path_info.name,
unit="file",
):
entry_info = path_info / relpath
self._save_file(
entry_info, tree, entry_hash, save_link=False, **kwargs
)
@@ -404,13 +386,9 @@ def _changed_dir_cache(self, hash_info, path_info=None, filter_info=None):
if self.changed_cache_file(hash_info):
return True

for entry in self.get_dir_cache(hash_info):
entry_hash = HashInfo(
self.tree.PARAM_CHECKSUM, entry[self.tree.PARAM_CHECKSUM]
)

dir_info = self.get_dir_cache(hash_info)
for entry_info, entry_hash in dir_info.items(path_info):
if path_info and filter_info:
entry_info = path_info / entry[self.tree.PARAM_RELPATH]
if not entry_info.isin_or_eq(filter_info):
continue

@@ -488,16 +466,14 @@ def _checkout_dir(

logger.debug("Linking directory '%s'.", path_info)

for entry in dir_info:
relative_path = entry[self.tree.PARAM_RELPATH]
entry_hash = entry[self.tree.PARAM_CHECKSUM]
entry_cache_info = self.tree.hash_to_path_info(entry_hash)
entry_info = path_info / relative_path
for entry_info, entry_hash_info in dir_info.items(path_info):
entry_cache_info = self.tree.hash_to_path_info(
entry_hash_info.value
)

if filter_info and not entry_info.isin_or_eq(filter_info):
continue

entry_hash_info = HashInfo(self.tree.PARAM_CHECKSUM, entry_hash)
if relink or self.changed(entry_info, entry_hash_info):
modified = True
self.safe_remove(entry_info, force=force)
@@ -520,9 +496,7 @@ def _checkout_dir(
def _remove_redundant_files(self, path_info, dir_info, force):
existing_files = set(self.tree.walk_files(path_info))

needed_files = {
path_info / entry[self.tree.PARAM_RELPATH] for entry in dir_info
}
needed_files = {info for info, _ in dir_info.items(path_info)}
redundant_files = existing_files - needed_files
for path in redundant_files:
self.safe_remove(path, force)
@@ -623,77 +597,19 @@ def get_files_number(self, path_info, hash_info, filter_info):
return 1

if not filter_info:
return len(self.get_dir_cache(hash_info))
return self.get_dir_cache(hash_info).nfiles

return ilen(
filter_info.isin_or_eq(path_info / entry[self.tree.PARAM_CHECKSUM])
for entry in self.get_dir_cache(hash_info)
)

def _to_dict(self, dir_info):
info = {}
for _entry in dir_info:
entry = _entry.copy()
relpath = entry.pop(self.tree.PARAM_RELPATH)
info[relpath] = HashInfo.from_dict(entry)
return info

def _from_dict(self, dir_dict):
return [
{self.tree.PARAM_RELPATH: relpath, **hash_info.to_dict()}
for relpath, hash_info in dir_dict.items()
]

@staticmethod
def _diff(ancestor, other, allow_removed=False):
from dictdiffer import diff

allowed = ["add"]
if allow_removed:
allowed.append("remove")

result = list(diff(ancestor, other))
for typ, _, _ in result:
if typ not in allowed:
raise MergeError(
"unable to auto-merge directories with diff that contains "
f"'{typ}'ed files"
)
return result

def _merge_dirs(self, ancestor_info, our_info, their_info):
from dictdiffer import patch

ancestor = self._to_dict(ancestor_info)
our = self._to_dict(our_info)
their = self._to_dict(their_info)

our_diff = self._diff(ancestor, our)
if not our_diff:
return self._from_dict(their)

their_diff = self._diff(ancestor, their)
if not their_diff:
return self._from_dict(our)

# make sure there are no conflicting files
self._diff(our, their, allow_removed=True)

merged = patch(our_diff + their_diff, ancestor, in_place=True)

# Sorting the list by path to ensure reproducibility
return sorted(
self._from_dict(merged), key=itemgetter(self.tree.PARAM_RELPATH),
filter_info.isin_or_eq(path_info / relpath)
for relpath, _ in self.get_dir_cache(hash_info).items()
)

def _get_dir_size(self, dir_info):
def _getsize(entry):
return self.tree.getsize(
self.tree.hash_to_path_info(entry[self.tree.PARAM_CHECKSUM])
)

try:
return sum(_getsize(entry) for entry in dir_info)
return sum(
self.tree.getsize(self.tree.hash_to_path_info(hi.value))
for _, hi in dir_info.items()
)
except FileNotFoundError:
return None

@@ -704,12 +620,12 @@ def merge(self, ancestor_info, our_info, their_info):
if ancestor_info:
ancestor = self.get_dir_cache(ancestor_info)
else:
ancestor = []
ancestor = DirInfo()

our = self.get_dir_cache(our_info)
their = self.get_dir_cache(their_info)

merged = self._merge_dirs(ancestor, our, their)
merged = our.merge(ancestor, their)
hash_info = self.save_dir_info(merged)
hash_info.size = self._get_dir_size(merged)
return hash_info
@@ -728,5 +644,5 @@ def get_hash(self, tree, path_info):
def set_dir_info(self, hash_info):
assert hash_info.isdir

hash_info.dir_info = self._to_dict(self.get_dir_cache(hash_info))
hash_info.nfiles = len(hash_info.dir_info)
hash_info.dir_info = self.get_dir_cache(hash_info)
hash_info.nfiles = hash_info.dir_info.nfiles
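
Across this file the pattern is the same: code that used to index raw entry dicts by PARAM_RELPATH/PARAM_CHECKSUM now iterates DirInfo.items(path_info) and receives (entry_info, HashInfo) pairs. A minimal sketch of the two styles, assuming a DVC checkout that includes the dvc.dir_info module added below and using hypothetical relpaths and md5 values:

from dvc.dir_info import DirInfo

# Hypothetical dir-cache entries in the old list-of-dicts format.
entries = [
    {"relpath": "data/a.txt", "md5": "11111111111111111111111111111111"},
    {"relpath": "data/b.txt", "md5": "22222222222222222222222222222222"},
]

# Old style: index each raw entry dict by key.
for entry in entries:
    relpath, checksum = entry["relpath"], entry["md5"]

# New style: DirInfo yields the path and a HashInfo object directly
# (in the cache code a PathInfo prefix is passed to items()).
dir_info = DirInfo.from_list(entries)
for relpath, hash_info in dir_info.items():
    checksum = hash_info.value
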
102 changes: 102 additions & 0 deletions dvc/dir_info.py
@@ -0,0 +1,102 @@
import posixpath
from operator import itemgetter

from pygtrie import Trie

from .hash_info import HashInfo


def _diff(ancestor, other, allow_removed=False):
    from dictdiffer import diff

    from dvc.exceptions import MergeError

    allowed = ["add"]
    if allow_removed:
        allowed.append("remove")

    result = list(diff(ancestor, other))
    for typ, _, _ in result:
        if typ not in allowed:
            raise MergeError(
                "unable to auto-merge directories with diff that contains "
                f"'{typ}'ed files"
            )
    return result


def _merge(ancestor, our, their):
    import copy

    from dictdiffer import patch

    our_diff = _diff(ancestor, our)
    if not our_diff:
        return copy.deepcopy(their)

    their_diff = _diff(ancestor, their)
    if not their_diff:
        return copy.deepcopy(our)

    # make sure there are no conflicting files
    _diff(our, their, allow_removed=True)

    return patch(our_diff + their_diff, ancestor)


class DirInfo:
    PARAM_RELPATH = "relpath"

    def __init__(self):
        self.trie = Trie()

    @property
    def size(self):
        try:
            return sum(
                hash_info.size
                for _, hash_info in self.trie.iteritems()  # noqa: B301
            )
        except TypeError:
            return None

    @property
    def nfiles(self):
        return len(self.trie)

    def items(self, path_info=None):
        for key, hash_info in self.trie.iteritems():  # noqa: B301
            path = posixpath.sep.join(key)
            if path_info is not None:
                path = path_info / path
            yield path, hash_info

    @classmethod
    def from_list(cls, lst):
        ret = DirInfo()
        for _entry in lst:
            entry = _entry.copy()
            relpath = entry.pop(cls.PARAM_RELPATH)
            parts = tuple(relpath.split(posixpath.sep))
            ret.trie[parts] = HashInfo.from_dict(entry)
        return ret

    def to_list(self):
        # Sorting the list by path to ensure reproducibility
        return sorted(
            (
                {
                    # NOTE: not using hash_info.to_dict() because we don't want
                    # size/nfiles fields at this point.
                    hash_info.name: hash_info.value,
                    self.PARAM_RELPATH: posixpath.sep.join(parts),
                }
                for parts, hash_info in self.trie.iteritems()  # noqa: B301
            ),
            key=itemgetter(self.PARAM_RELPATH),
        )

    def merge(self, ancestor, their):
        merged = DirInfo()
        merged.trie = _merge(ancestor.trie, self.trie, their.trie)
        return merged
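
A short usage sketch of the class as added above, exercising from_list, merge, nfiles, and to_list. The relpaths and md5 values are hypothetical, and the merge is the add-only case that _diff permits:

from dvc.dir_info import DirInfo

base = DirInfo.from_list(
    [{"relpath": "dir/a.txt", "md5": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}]
)
ours = DirInfo.from_list(
    [
        {"relpath": "dir/a.txt", "md5": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"},
        {"relpath": "dir/b.txt", "md5": "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"},
    ]
)
theirs = DirInfo.from_list(
    [
        {"relpath": "dir/a.txt", "md5": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"},
        {"relpath": "dir/c.txt", "md5": "cccccccccccccccccccccccccccccccc"},
    ]
)

# Both sides only added files relative to base, so no MergeError is raised.
merged = ours.merge(base, theirs)
print(merged.nfiles)  # number of files tracked in the merged trie (3 here)
print([entry["relpath"] for entry in merged.to_list()])  # sorted by relpath
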
6 changes: 2 additions & 4 deletions dvc/output/base.py
@@ -448,9 +448,7 @@ def collect_used_dir_cache(
path = str(self.path_info)
filter_path = str(filter_info) if filter_info else None
is_win = os.name == "nt"
for entry in self.dir_cache:
checksum = entry[self.tree.PARAM_CHECKSUM]
entry_relpath = entry[self.tree.PARAM_RELPATH]
for entry_relpath, entry_hash_info in self.dir_cache.items():
if is_win:
entry_relpath = entry_relpath.replace("/", os.sep)
entry_path = os.path.join(path, entry_relpath)
@@ -459,7 +457,7 @@
or entry_path == filter_path
or entry_path.startswith(filter_path + os.sep)
):
cache.add(self.scheme, checksum, entry_path)
cache.add(self.scheme, entry_hash_info.value, entry_path)

return cache
