Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

optimize local status #3867

Merged
merged 5 commits into from
May 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from pathspec.patterns import GitWildMatchPattern

from dvc.scm.tree import BaseTree
from dvc.utils import relpath

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -35,12 +34,19 @@ def __call__(self, root, dirs, files):
return dirs, files

def matches(self, dirname, basename):
abs_path = os.path.join(dirname, basename)
rel_path = relpath(abs_path, self.dirname)

if os.pardir + os.sep in rel_path:
# NOTE: `relpath` is too slow, so we have to assume that both
# `dirname` and `self.dirname` are relative or absolute together.
prefix = self.dirname + os.sep
if dirname == self.dirname:
path = basename
elif dirname.startswith(prefix):
rel = dirname[len(prefix) :]
# NOTE: `os.path.join` is ~x5.5 slower
path = f"{rel}{os.sep}{basename}"
else:
return False
return self.ignore_spec.match_file(rel_path)

return self.ignore_spec.match_file(path)

def __hash__(self):
return hash(self.ignore_file_path)
Expand Down Expand Up @@ -135,7 +141,9 @@ def isexec(self, path):

def walk(self, top, topdown=True):
for root, dirs, files in self.tree.walk(top, topdown):
dirs[:], files[:] = self.dvcignore(root, dirs, files)
dirs[:], files[:] = self.dvcignore(
os.path.abspath(root), dirs, files
)

yield root, dirs, files

Expand Down
25 changes: 2 additions & 23 deletions dvc/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,7 +831,7 @@ def all(self, jobs=None, name=None):

@index_locked
def gc(self, named_cache, jobs=None):
used = self.extract_used_local_checksums(named_cache)
used = set(named_cache.scheme_keys("local"))

if self.scheme != "":
used.update(named_cache.scheme_keys(self.scheme))
Expand All @@ -847,6 +847,7 @@ def gc(self, named_cache, jobs=None):
continue
path_info = self.checksum_to_path_info(checksum)
if self.is_dir_checksum(checksum):
# backward compatibility
self._remove_unpacked_dir(checksum)
self.remove(path_info)
removed = True
Expand Down Expand Up @@ -905,11 +906,6 @@ def _changed_dir_cache(self, checksum, path_info=None, filter_info=None):
if self.changed_cache_file(checksum):
return True

if not (path_info and filter_info) and not self._changed_unpacked_dir(
checksum
):
return False

for entry in self.get_dir_cache(checksum):
entry_checksum = entry[self.PARAM_CHECKSUM]

Expand All @@ -921,9 +917,6 @@ def _changed_dir_cache(self, checksum, path_info=None, filter_info=None):
if self.changed_cache_file(entry_checksum):
return True

if not (path_info and filter_info):
self._update_unpacked_dir(checksum)

return False

def changed_cache(self, checksum, path_info=None, filter_info=None):
Expand Down Expand Up @@ -1362,19 +1355,5 @@ def get_files_number(self, path_info, checksum, filter_info):
def unprotect(path_info):
pass

def _get_unpacked_dir_names(self, checksums):
return set()

def extract_used_local_checksums(self, named_cache):
used = set(named_cache.scheme_keys("local"))
unpacked = self._get_unpacked_dir_names(used)
return used | unpacked

def _changed_unpacked_dir(self, checksum):
return True

def _update_unpacked_dir(self, checksum):
pass

def _remove_unpacked_dir(self, checksum):
pass
72 changes: 11 additions & 61 deletions dvc/remote/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,12 @@ def cache_path(self):
return os.path.abspath(self.cache_dir)

def checksum_to_path(self, checksum):
return os.path.join(self.cache_path, checksum[0:2], checksum[2:])
# NOTE: `self.cache_path` is already normalized so we can simply use
# `os.sep` instead of `os.path.join`. This results in this helper
# being ~5.5 times faster.
return (
f"{self.cache_path}{os.sep}{checksum[0:2]}{os.sep}{checksum[2:]}"
)

def list_cache_paths(self, prefix=None, progress_callback=None):
assert self.path_info is not None
Expand Down Expand Up @@ -695,70 +700,15 @@ def protect(self, path_info):
if actual != mode:
raise

def _get_unpacked_dir_path_info(self, checksum):
info = self.checksum_to_path_info(checksum)
return info.with_name(info.name + self.UNPACKED_DIR_SUFFIX)

def _remove_unpacked_dir(self, checksum):
path_info = self._get_unpacked_dir_path_info(checksum)
info = self.checksum_to_path_info(checksum)
path_info = info.with_name(info.name + self.UNPACKED_DIR_SUFFIX)
self.remove(path_info)

def _path_info_changed(self, path_info):
if self.exists(path_info) and self.state.get(path_info):
return False
return True

def _update_unpacked_dir(self, checksum):
unpacked_dir_info = self._get_unpacked_dir_path_info(checksum)

if not self._path_info_changed(unpacked_dir_info):
return

self.remove(unpacked_dir_info)

try:
dir_info = self.get_dir_cache(checksum)
self._create_unpacked_dir(checksum, dir_info, unpacked_dir_info)
except DvcException:
logger.warning(f"Could not create '{unpacked_dir_info}'")

self.remove(unpacked_dir_info)

def _create_unpacked_dir(self, checksum, dir_info, unpacked_dir_info):
self.makedirs(unpacked_dir_info)

for entry in Tqdm(dir_info, desc="Creating unpacked dir", unit="file"):
entry_cache_info = self.checksum_to_path_info(
entry[self.PARAM_CHECKSUM]
)
relative_path = entry[self.PARAM_RELPATH]
# In shared cache mode some cache files might not be owned by the
# user, so we need to use symlinks because, unless
# /proc/sys/fs/protected_hardlinks is disabled, the user is not
# allowed to create hardlinks to files that he doesn't own.
link_types = ["hardlink", "symlink"]
self._link(
entry_cache_info, unpacked_dir_info / relative_path, link_types
)

self.state.save(unpacked_dir_info, checksum)

def _changed_unpacked_dir(self, checksum):
status_unpacked_dir_info = self._get_unpacked_dir_path_info(checksum)

return not self.state.get(status_unpacked_dir_info)

def _get_unpacked_dir_names(self, checksums):
unpacked = set()
for c in checksums:
if self.is_dir_checksum(c):
unpacked.add(c + self.UNPACKED_DIR_SUFFIX)
return unpacked

def is_protected(self, path_info):
if not self.exists(path_info):
try:
mode = os.stat(path_info).st_mode
except FileNotFoundError:
return False

mode = os.stat(path_info).st_mode

return stat.S_IMODE(mode) == self.CACHE_MODE
3 changes: 2 additions & 1 deletion dvc/scm/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ def walk(self, top, topdown=True):
def walk_files(self, top):
for root, _, files in self.walk(top):
for file in files:
yield os.path.join(root, file)
# NOTE: os.path.join is ~5.5 times slower
yield f"{root}{os.sep}{file}"


class WorkingTree(BaseTree):
Expand Down
65 changes: 0 additions & 65 deletions tests/func/remote/test_local.py

This file was deleted.

2 changes: 2 additions & 0 deletions tests/func/test_gc.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,8 @@ def test_gc_no_unpacked_dir(tmp_dir, dvc):
dir_stages[0].outs[0].cache_path + LocalRemote.UNPACKED_DIR_SUFFIX
)

# older (pre 1.0) versions of dvc used to generate this dir
shutil.copytree("dir", unpackeddir)
assert os.path.exists(unpackeddir)

dvc.gc(force=True, workspace=True)
Expand Down
1 change: 0 additions & 1 deletion tests/unit/test_ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ def test_ignore_from_file_should_filter_dirs_and_files():
),
("dont_ignore.txt", ["dont_ignore"], False),
("dont_ignore.txt", ["dont*", "!dont_ignore.txt"], False),
("../../../something.txt", ["**/something.txt"], False),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is too expensive for match to try to resolve paths like this.

],
)
def test_match_ignore_from_file(
Expand Down