Skip to content

Commit

Permalink
dvc: move checkout out of cache
Browse files Browse the repository at this point in the history
Checkout is an operation that builds an object from the cache (aka object
database) in a particular tree (aka filesystem). Thus it shouldn't be
part of either the object database itself or the filesystem.

Pre-requisite for iterative#5337
  • Loading branch information
efiop committed Feb 5, 2021
1 parent 8676617 commit d362f0d
Show file tree
Hide file tree
Showing 7 changed files with 319 additions and 308 deletions.
301 changes: 5 additions & 296 deletions dvc/cache/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,8 @@
from funcy import decorator
from shortuuid import uuid

import dvc.prompt as prompt
from dvc.dir_info import DirInfo
from dvc.exceptions import (
CacheLinkError,
CheckoutError,
ConfirmRemoveError,
DvcException,
)
from dvc.exceptions import CacheLinkError, DvcException
from dvc.progress import Tqdm
from dvc.remote.slow_link_detection import ( # type: ignore[attr-defined]
slow_link_guard,
Expand Down Expand Up @@ -93,77 +87,6 @@ def load_dir_cache(self, hash_info):

return DirInfo.from_list(d)

def _filter_hash_info(self, hash_info, path_info, filter_info):
if not filter_info or path_info == filter_info:
return hash_info

dir_info = self.get_dir_cache(hash_info)
hash_key = filter_info.relative_to(path_info).parts

# Check whether it is a file that exists on the trie
hi = dir_info.trie.get(hash_key)
if hi:
return hi

depth = len(hash_key)
filtered_dir_info = DirInfo()
try:
for key, value in dir_info.trie.items(hash_key):
filtered_dir_info.trie[key[depth:]] = value
except KeyError:
return None

return self._get_dir_info_hash(filtered_dir_info)[0]

def changed(self, path_info, tree, hash_info, filter_info=None):
    """Checks if data has changed.

    A file is considered changed if:
        - It doesn't exist on the working directory (was unlinked)
        - Hash value is not computed (saving a new file)
        - The hash value stored is different from the given one
        - There's no file in the cache

    Args:
        path_info: path of the data being checked.
        tree: tree queried for the existence and the current hash of
            the path.
        hash_info: expected hash value for this data.
        filter_info: an optional argument to target a specific path.

    Returns:
        bool: True if data has changed, False otherwise.
    """
    # When a filter is given, it is the filtered path that gets checked.
    path = filter_info or path_info
    logger.trace(
        "checking if '%s'('%s') has changed.", path_info, hash_info
    )

    if not tree.exists(path):
        logger.debug("'%s' doesn't exist.", path)
        return True

    hi = self._filter_hash_info(hash_info, path_info, filter_info)
    if not hi:
        logger.debug("hash value for '%s' is missing.", path)
        return True

    if self.changed_cache(hi):
        logger.debug("cache for '%s'('%s') has changed.", path, hi)
        return True

    actual = tree.get_hash(path)
    if hi != actual:
        # Argument order must follow the placeholders:
        # expected hash, path, actual hash.
        logger.debug(
            "hash value '%s' for '%s' has changed (actual '%s').",
            hi,
            path,
            actual,
        )
        return True

    logger.trace("'%s' hasn't changed.", path)
    return False

def link(self, from_info, to_info):
self._link(from_info, to_info, self.cache_types)

Expand Down Expand Up @@ -301,7 +224,7 @@ def _transfer_directory(
(
hash_info,
to_info,
) = local_cache._get_dir_info_hash( # pylint: disable=protected-access
) = local_cache.get_dir_info_hash( # pylint: disable=protected-access
dir_info
)

Expand All @@ -320,7 +243,7 @@ def transfer(self, from_tree, from_info, jobs=None, no_progress_bar=False):
)
return self._transfer_file(from_tree, from_info)

def _cache_is_copy(self, path_info):
def cache_is_copy(self, path_info):
"""Checks whether cache uses copies."""
if self.cache_type_confirmed:
return self.cache_types[0] == "copy"
Expand All @@ -342,7 +265,7 @@ def _cache_is_copy(self, path_info):
self.cache_type_confirmed = True
return self.cache_types[0] == "copy"

def _get_dir_info_hash(self, dir_info):
def get_dir_info_hash(self, dir_info):
import tempfile

from dvc.path_info import PathInfo
Expand Down Expand Up @@ -372,7 +295,7 @@ def save_dir_info(self, dir_info, hash_info=None):
):
return hash_info

hi, tmp_info = self._get_dir_info_hash(dir_info)
hi, tmp_info = self.get_dir_info_hash(dir_info)
new_info = self.tree.hash_to_path_info(hi.value)
if self.changed_cache_file(hi):
self.makedirs(new_info.parent)
Expand Down Expand Up @@ -515,220 +438,6 @@ def changed_cache(self, hash_info, path_info=None, filter_info=None):
)
return self.changed_cache_file(hash_info)

def already_cached(self, path_info, tree):
    """Tell whether the data at ``path_info`` has an unchanged cache entry.

    The answer is ``False`` when ``tree`` yields no hash for the path;
    otherwise it is the negation of ``self.changed_cache`` for that hash.
    """
    known_hash = tree.get_hash(path_info)
    return bool(known_hash) and not self.changed_cache(known_hash)

def safe_remove(self, path_info, tree, force=False):
    """Remove ``path_info`` from ``tree``, asking first if it isn't cached.

    Nothing happens when the path does not exist. Unless ``force`` is
    set, a path that ``already_cached`` does not vouch for requires
    confirmation through ``prompt.confirm``.

    Raises:
        ConfirmRemoveError: the confirmation prompt was declined.
    """
    if not tree.exists(path_info):
        return

    needs_confirmation = not (
        force or self.already_cached(path_info, tree)
    )
    if needs_confirmation:
        question = (
            "file '{}' is going to be removed."
            " Are you sure you want to proceed?".format(str(path_info))
        )
        if not prompt.confirm(question):
            raise ConfirmRemoveError(str(path_info))

    tree.remove(path_info)

def _checkout_file(
self,
path_info,
tree,
hash_info,
force,
progress_callback=None,
relink=False,
):
"""The file is changed we need to checkout a new copy"""
cache_info = self.tree.hash_to_path_info(hash_info.value)
if tree.exists(path_info):
added = False

if not relink and self.changed(path_info, tree, hash_info):
modified = True
self.safe_remove(path_info, tree, force=force)
self.link(cache_info, path_info)
else:
modified = False

if tree.iscopy(path_info) and self._cache_is_copy(path_info):
self.unprotect(path_info)
else:
self.safe_remove(path_info, tree, force=force)
self.link(cache_info, path_info)
else:
self.link(cache_info, path_info)
added, modified = True, False

tree.state.save(path_info, hash_info)
if progress_callback:
progress_callback(str(path_info))

return added, modified

def _checkout_dir(
    self,
    path_info,
    tree,
    hash_info,
    force,
    progress_callback=None,
    relink=False,
    filter_info=None,
):
    """Check out every entry of a cached directory under ``path_info``.

    Entries outside ``filter_info`` (when given) are skipped. After the
    entries are processed, files found under ``path_info`` that are not
    part of the directory listing are removed.

    Returns:
        tuple: ``(added, modified)``. ``added`` is True when the
        directory did not exist beforehand. ``modified`` is reported only
        when the directory was not added and ``relink`` is off.
    """
    # The directory is created up front so that it exists even when the
    # listing contains no files.
    added = not tree.exists(path_info)
    if added:
        self.makedirs(path_info)

    dir_info = self.get_dir_cache(hash_info)

    logger.debug("Linking directory '%s'.", path_info)

    modified = False
    for entry_info, entry_hash_info in dir_info.items(path_info):
        if filter_info and not entry_info.isin_or_eq(filter_info):
            continue

        entry_added, entry_modified = self._checkout_file(
            entry_info,
            tree,
            entry_hash_info,
            force,
            progress_callback,
            relink,
        )
        if entry_added or entry_modified:
            modified = True

    removed_extras = self._remove_redundant_files(
        path_info, tree, dir_info, force
    )
    modified = removed_extras or modified

    tree.state.save(path_info, hash_info)

    # A relink is not reported as a modification; treat it as no change.
    reported_modified = not added and modified and not relink
    return added, reported_modified

def _remove_redundant_files(self, path_info, tree, dir_info, force):
existing_files = set(tree.walk_files(path_info))

needed_files = {info for info, _ in dir_info.items(path_info)}
redundant_files = existing_files - needed_files
for path in redundant_files:
self.safe_remove(path, tree, force)

return bool(redundant_files)

@use_state
def checkout(
    self,
    path_info,
    tree,
    hash_info,
    force=False,
    progress_callback=None,
    relink=False,
    filter_info=None,
    quiet=False,
):
    """Check out ``hash_info`` from the cache into ``tree`` at ``path_info``.

    The call returns without doing any work when the data is unchanged
    (and ``relink`` is off). It fails when there is no hash info or when
    ``changed_cache`` reports the cache as changed; in both failure cases
    the path is removed through ``safe_remove`` first. ``quiet``
    suppresses the warnings logged for those failures.

    Raises:
        NotImplementedError: ``path_info.scheme`` is neither "local" nor
            the scheme of this cache's tree.
        CheckoutError: the checkout failed for ``path_info``.
    """
    if path_info.scheme not in ("local", self.tree.scheme):
        raise NotImplementedError

    failed_path = None
    unchanged = False

    if not hash_info:
        if not quiet:
            logger.warning(
                "No file hash info found for '%s'. It won't be created.",
                path_info,
            )
        self.safe_remove(path_info, tree, force=force)
        failed_path = path_info
    elif not (
        relink
        or self.changed(path_info, tree, hash_info, filter_info=filter_info)
    ):
        logger.trace("Data '%s' didn't change.", path_info)
        unchanged = True
    elif self.changed_cache(
        hash_info, path_info=path_info, filter_info=filter_info
    ):
        if not quiet:
            logger.warning(
                "Cache '%s' not found. File '%s' won't be created.",
                hash_info,
                path_info,
            )
        self.safe_remove(path_info, tree, force=force)
        failed_path = path_info

    if not (failed_path or unchanged):
        logger.debug(
            "Checking out '%s' with cache '%s'.", path_info, hash_info
        )
        return self._checkout(
            path_info,
            tree,
            hash_info,
            force,
            progress_callback,
            relink,
            filter_info,
        )

    # Nothing is checked out; the callback still gets the files number.
    if progress_callback:
        progress_callback(
            str(path_info),
            self.get_files_number(
                self.tree.path_info, hash_info, filter_info
            ),
        )
    if failed_path:
        raise CheckoutError([failed_path])
    return None

def _checkout(
self,
path_info,
tree,
hash_info,
force=False,
progress_callback=None,
relink=False,
filter_info=None,
):
if not hash_info.isdir:
ret = self._checkout_file(
path_info, tree, hash_info, force, progress_callback, relink
)
else:
ret = self._checkout_dir(
path_info,
tree,
hash_info,
force,
progress_callback,
relink,
filter_info,
)

tree.state.save_link(path_info)

return ret

def get_files_number(self, path_info, hash_info, filter_info):
from funcy.py3 import ilen

Expand Down
5 changes: 0 additions & 5 deletions dvc/cache/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,6 @@ def hashes_exist(
)
]

def already_cached(self, path_info, tree):
    """Local-only variant of the parent's ``already_cached`` check.

    Asserts that ``path_info`` has an empty or "local" scheme and then
    delegates to the parent implementation.
    """
    local_schemes = ("", "local")
    assert path_info.scheme in local_schemes

    return super().already_cached(path_info, tree)

def _verify_link(self, path_info, link_type):
if link_type == "hardlink" and self.tree.getsize(path_info) == 0:
return
Expand Down
Loading

0 comments on commit d362f0d

Please sign in to comment.