-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
remote: support saving RepoTree objects directly to cache #3825
Changes from all commits
6117cf0
18e611b
dd2ffa7
076dcfe
f1991b6
22594e1
de17469
deb729d
5bfd57d
ff28ac8
a60f8f5
6add867
a6dcaf3
2047d85
e79c923
821c507
4082dd2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,7 @@ | |
from dvc.progress import Tqdm | ||
from dvc.remote.index import RemoteIndex, RemoteIndexNoop | ||
from dvc.remote.slow_link_detection import slow_link_guard | ||
from dvc.scm.tree import is_working_tree | ||
from dvc.state import StateNoop | ||
from dvc.utils import tmp_fname | ||
from dvc.utils.fs import makedirs, move | ||
|
@@ -253,13 +254,17 @@ def get_dir_checksum(self, path_info): | |
raise RemoteCacheRequiredError(path_info) | ||
|
||
dir_info = self._collect_dir(path_info) | ||
return self._save_dir_info(dir_info, path_info) | ||
|
||
def _save_dir_info(self, dir_info, path_info=None): | ||
checksum, tmp_info = self._get_dir_info_checksum(dir_info) | ||
new_info = self.cache.checksum_to_path_info(checksum) | ||
if self.cache.changed_cache_file(checksum): | ||
self.cache.makedirs(new_info.parent) | ||
self.cache.move(tmp_info, new_info, mode=self.CACHE_MODE) | ||
|
||
self.state.save(path_info, checksum) | ||
if path_info: | ||
self.state.save(path_info, checksum) | ||
self.state.save(new_info, checksum) | ||
|
||
return checksum | ||
|
@@ -452,27 +457,33 @@ def _do_link(self, from_info, to_info, link_method): | |
"Created '%s': %s -> %s", self.cache_types[0], from_info, to_info, | ||
) | ||
|
||
def _save_file(self, path_info, checksum, save_link=True): | ||
def _save_file(self, path_info, checksum, save_link=True, tree=None): | ||
assert checksum | ||
|
||
cache_info = self.checksum_to_path_info(checksum) | ||
if self.changed_cache(checksum): | ||
self.move(path_info, cache_info, mode=self.CACHE_MODE) | ||
self.link(cache_info, path_info) | ||
elif self.iscopy(path_info) and self._cache_is_copy(path_info): | ||
# Default relink procedure involves unneeded copy | ||
self.unprotect(path_info) | ||
if tree: | ||
if self.changed_cache(checksum): | ||
with tree.open(path_info, mode="rb") as fobj: | ||
self.copy_fobj(fobj, cache_info) | ||
else: | ||
self.remove(path_info) | ||
self.link(cache_info, path_info) | ||
if self.changed_cache(checksum): | ||
self.move(path_info, cache_info, mode=self.CACHE_MODE) | ||
self.link(cache_info, path_info) | ||
elif self.iscopy(path_info) and self._cache_is_copy(path_info): | ||
# Default relink procedure involves unneeded copy | ||
self.unprotect(path_info) | ||
else: | ||
self.remove(path_info) | ||
self.link(cache_info, path_info) | ||
|
||
if save_link: | ||
self.state.save_link(path_info) | ||
if save_link: | ||
self.state.save_link(path_info) | ||
|
||
# we need to update path and cache, since in case of reflink, | ||
# or copy cache type moving original file results in updates on | ||
# next executed command, which causes md5 recalculation | ||
self.state.save(path_info, checksum) | ||
if not tree or is_working_tree(tree): | ||
self.state.save(path_info, checksum) | ||
self.state.save(cache_info, checksum) | ||
|
||
def _cache_is_copy(self, path_info): | ||
|
@@ -497,22 +508,43 @@ def _cache_is_copy(self, path_info): | |
self.cache_type_confirmed = True | ||
return self.cache_types[0] == "copy" | ||
|
||
def _save_dir(self, path_info, checksum, save_link=True): | ||
cache_info = self.checksum_to_path_info(checksum) | ||
dir_info = self.get_dir_cache(checksum) | ||
def _save_dir(self, path_info, checksum, save_link=True, tree=None): | ||
if tree: | ||
checksum = self._save_tree(path_info, tree) | ||
else: | ||
dir_info = self.get_dir_cache(checksum) | ||
Comment on lines
+512
to
+515
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We've discussed this before, but just to clarify: I suppose we need this because we are not ready to make […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So we can add tree support in […]
But walking the tree twice like this seems unnecessary for this use case, since we can save objects from the tree and collect the dir cache info at the same time during a single walk, and I wasn't sure if saving objects from tree during the walk would be desired behavior for […]. Thinking about it now, what we could do is add the […] |
||
|
||
for entry in Tqdm( | ||
dir_info, desc="Saving " + path_info.name, unit="file" | ||
): | ||
entry_info = path_info / entry[self.PARAM_RELPATH] | ||
entry_checksum = entry[self.PARAM_CHECKSUM] | ||
self._save_file(entry_info, entry_checksum, save_link=False) | ||
for entry in Tqdm( | ||
dir_info, desc="Saving " + path_info.name, unit="file" | ||
): | ||
entry_info = path_info / entry[self.PARAM_RELPATH] | ||
entry_checksum = entry[self.PARAM_CHECKSUM] | ||
self._save_file(entry_info, entry_checksum, save_link=False) | ||
|
||
if save_link: | ||
self.state.save_link(path_info) | ||
if save_link: | ||
self.state.save_link(path_info) | ||
|
||
cache_info = self.checksum_to_path_info(checksum) | ||
self.state.save(cache_info, checksum) | ||
self.state.save(path_info, checksum) | ||
if not tree or is_working_tree(tree): | ||
self.state.save(path_info, checksum) | ||
|
||
def _save_tree(self, path_info, tree): | ||
# save tree directory to cache, collect dir cache during walk and | ||
# return the resulting dir checksum | ||
dir_info = [] | ||
for fname in tree.walk_files(path_info): | ||
checksum = tree.get_file_checksum(fname) | ||
file_info = { | ||
self.PARAM_CHECKSUM: checksum, | ||
self.PARAM_RELPATH: fname.relative_to(path_info).as_posix(), | ||
} | ||
self._save_file(fname, checksum, tree=tree) | ||
dir_info.append(file_info) | ||
Comment on lines
+536
to
+543
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The actual behavior for this […] Git files will always be saved directly from the tree in […]
Basically, for erepo's and import/get, we probably always want the behavior from […] |
||
|
||
return self._save_dir_info( | ||
sorted(dir_info, key=itemgetter(self.PARAM_RELPATH)) | ||
) | ||
|
||
def is_empty(self, path_info): | ||
return False | ||
|
@@ -541,22 +573,36 @@ def walk_files(self, path_info): | |
def protect(path_info): | ||
pass | ||
|
||
def save(self, path_info, checksum_info, save_link=True): | ||
def save(self, path_info, checksum_info, save_link=True, tree=None): | ||
if path_info.scheme != self.scheme: | ||
raise RemoteActionNotImplemented( | ||
f"save {path_info.scheme} -> {self.scheme}", self.scheme, | ||
) | ||
|
||
checksum = checksum_info[self.PARAM_CHECKSUM] | ||
self._save(path_info, checksum, save_link) | ||
if tree: | ||
# save checksum will be computed during tree walk | ||
checksum = None | ||
else: | ||
checksum = checksum_info[self.PARAM_CHECKSUM] | ||
self._save(path_info, checksum, save_link, tree) | ||
|
||
def _save(self, path_info, checksum, save_link=True, tree=None): | ||
if tree: | ||
logger.debug("Saving tree path '%s' to cache.", path_info) | ||
else: | ||
to_info = self.checksum_to_path_info(checksum) | ||
logger.debug("Saving '%s' to '%s'.", path_info, to_info) | ||
|
||
def _save(self, path_info, checksum, save_link=True): | ||
to_info = self.checksum_to_path_info(checksum) | ||
logger.debug("Saving '%s' to '%s'.", path_info, to_info) | ||
if self.isdir(path_info): | ||
self._save_dir(path_info, checksum, save_link) | ||
if tree: | ||
isdir = tree.isdir | ||
save_link = False | ||
else: | ||
isdir = self.isdir | ||
Comment on lines
+596
to
+600
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I like how RemoteTree (or something like that) becomes more and more apparent 🙂 And that the state should probably belong to the tree. So here it would be a noop for git tree. Not saying we should implement all of that right now, just noticing the things that we've discussed earlier 🙂 |
||
|
||
if isdir(path_info): | ||
self._save_dir(path_info, checksum, save_link, tree) | ||
return | ||
self._save_file(path_info, checksum, save_link) | ||
self._save_file(path_info, checksum, save_link, tree) | ||
|
||
def _handle_transfer_exception( | ||
self, from_info, to_info, exception, operation | ||
|
@@ -695,6 +741,9 @@ def move(self, from_info, to_info, mode=None): | |
def copy(self, from_info, to_info): | ||
raise RemoteActionNotImplemented("copy", self.scheme) | ||
|
||
def copy_fobj(self, fobj, to_info): | ||
raise RemoteActionNotImplemented("copy_fobj", self.scheme) | ||
|
||
def symlink(self, from_info, to_info): | ||
raise RemoteActionNotImplemented("symlink", self.scheme) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could also think about making `save()` only save without checking out (these `link`s are effectively `checkout`). This `if` & `else` makes it clear that this method tries to do too much. But at the same time `move` is a very nice optimization for big files that allows us to make it happen instantly if we are within the same fs.
I would be fine with keeping it as is for now, but let's at least keep this in mind.