diff --git a/dvc/ignore.py b/dvc/ignore.py index b9d4de35af..eefa3bfef8 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -6,7 +6,6 @@ from pathspec.patterns import GitWildMatchPattern from dvc.scm.tree import BaseTree -from dvc.utils import relpath logger = logging.getLogger(__name__) @@ -35,12 +34,19 @@ def __call__(self, root, dirs, files): return dirs, files def matches(self, dirname, basename): - abs_path = os.path.join(dirname, basename) - rel_path = relpath(abs_path, self.dirname) - - if os.pardir + os.sep in rel_path: + # NOTE: `relpath` is too slow, so we have to assume that both + # `dirname` and `self.dirname` are relative or absolute together. + prefix = self.dirname + os.sep + if dirname == self.dirname: + path = basename + elif dirname.startswith(prefix): + rel = dirname[len(prefix) :] + # NOTE: `os.path.join` is ~x5.5 slower + path = f"{rel}{os.sep}{basename}" + else: return False - return self.ignore_spec.match_file(rel_path) + + return self.ignore_spec.match_file(path) def __hash__(self): return hash(self.ignore_file_path) @@ -135,7 +141,9 @@ def isexec(self, path): def walk(self, top, topdown=True): for root, dirs, files in self.tree.walk(top, topdown): - dirs[:], files[:] = self.dvcignore(root, dirs, files) + dirs[:], files[:] = self.dvcignore( + os.path.abspath(root), dirs, files + ) yield root, dirs, files diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 8b338fd07b..8c6d6ac1d7 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -831,7 +831,7 @@ def all(self, jobs=None, name=None): @index_locked def gc(self, named_cache, jobs=None): - used = self.extract_used_local_checksums(named_cache) + used = set(named_cache.scheme_keys("local")) if self.scheme != "": used.update(named_cache.scheme_keys(self.scheme)) @@ -847,6 +847,7 @@ def gc(self, named_cache, jobs=None): continue path_info = self.checksum_to_path_info(checksum) if self.is_dir_checksum(checksum): + # backward compatibility self._remove_unpacked_dir(checksum) self.remove(path_info) removed = True @@ -905,11 +906,6 @@ def _changed_dir_cache(self, checksum, path_info=None, filter_info=None): if self.changed_cache_file(checksum): return True - if not (path_info and filter_info) and not self._changed_unpacked_dir( - checksum - ): - return False - for entry in self.get_dir_cache(checksum): entry_checksum = entry[self.PARAM_CHECKSUM] @@ -921,9 +917,6 @@ def _changed_dir_cache(self, checksum, path_info=None, filter_info=None): if self.changed_cache_file(entry_checksum): return True - if not (path_info and filter_info): - self._update_unpacked_dir(checksum) - return False def changed_cache(self, checksum, path_info=None, filter_info=None): @@ -1362,19 +1355,5 @@ def get_files_number(self, path_info, checksum, filter_info): def unprotect(path_info): pass - def _get_unpacked_dir_names(self, checksums): - return set() - - def extract_used_local_checksums(self, named_cache): - used = set(named_cache.scheme_keys("local")) - unpacked = self._get_unpacked_dir_names(used) - return used | unpacked - - def _changed_unpacked_dir(self, checksum): - return True - - def _update_unpacked_dir(self, checksum): - pass - def _remove_unpacked_dir(self, checksum): pass diff --git a/dvc/remote/local.py b/dvc/remote/local.py index d4e613353e..d9a347f687 100644 --- a/dvc/remote/local.py +++ b/dvc/remote/local.py @@ -92,7 +92,12 @@ def cache_path(self): return os.path.abspath(self.cache_dir) def checksum_to_path(self, checksum): - return os.path.join(self.cache_path, checksum[0:2], checksum[2:]) + # NOTE: `self.cache_path` is already normalized so we can simply use + # `os.sep` instead of `os.path.join`. This results in this helper + # being ~5.5 times faster. + return ( + f"{self.cache_path}{os.sep}{checksum[0:2]}{os.sep}{checksum[2:]}" + ) def list_cache_paths(self, prefix=None, progress_callback=None): assert self.path_info is not None @@ -695,70 +700,15 @@ def protect(self, path_info): if actual != mode: raise - def _get_unpacked_dir_path_info(self, checksum): - info = self.checksum_to_path_info(checksum) - return info.with_name(info.name + self.UNPACKED_DIR_SUFFIX) - def _remove_unpacked_dir(self, checksum): - path_info = self._get_unpacked_dir_path_info(checksum) + info = self.checksum_to_path_info(checksum) + path_info = info.with_name(info.name + self.UNPACKED_DIR_SUFFIX) self.remove(path_info) - def _path_info_changed(self, path_info): - if self.exists(path_info) and self.state.get(path_info): - return False - return True - - def _update_unpacked_dir(self, checksum): - unpacked_dir_info = self._get_unpacked_dir_path_info(checksum) - - if not self._path_info_changed(unpacked_dir_info): - return - - self.remove(unpacked_dir_info) - - try: - dir_info = self.get_dir_cache(checksum) - self._create_unpacked_dir(checksum, dir_info, unpacked_dir_info) - except DvcException: - logger.warning(f"Could not create '{unpacked_dir_info}'") - - self.remove(unpacked_dir_info) - - def _create_unpacked_dir(self, checksum, dir_info, unpacked_dir_info): - self.makedirs(unpacked_dir_info) - - for entry in Tqdm(dir_info, desc="Creating unpacked dir", unit="file"): - entry_cache_info = self.checksum_to_path_info( - entry[self.PARAM_CHECKSUM] - ) - relative_path = entry[self.PARAM_RELPATH] - # In shared cache mode some cache files might not be owned by the - # user, so we need to use symlinks because, unless - # /proc/sys/fs/protected_hardlinks is disabled, the user is not - # allowed to create hardlinks to files that he doesn't own. - link_types = ["hardlink", "symlink"] - self._link( - entry_cache_info, unpacked_dir_info / relative_path, link_types - ) - - self.state.save(unpacked_dir_info, checksum) - - def _changed_unpacked_dir(self, checksum): - status_unpacked_dir_info = self._get_unpacked_dir_path_info(checksum) - - return not self.state.get(status_unpacked_dir_info) - - def _get_unpacked_dir_names(self, checksums): - unpacked = set() - for c in checksums: - if self.is_dir_checksum(c): - unpacked.add(c + self.UNPACKED_DIR_SUFFIX) - return unpacked - def is_protected(self, path_info): - if not self.exists(path_info): + try: + mode = os.stat(path_info).st_mode + except FileNotFoundError: return False - mode = os.stat(path_info).st_mode - return stat.S_IMODE(mode) == self.CACHE_MODE diff --git a/dvc/scm/tree.py b/dvc/scm/tree.py index f65d8eefb8..7390b6e265 100644 --- a/dvc/scm/tree.py +++ b/dvc/scm/tree.py @@ -32,7 +32,8 @@ def walk(self, top, topdown=True): def walk_files(self, top): for root, _, files in self.walk(top): for file in files: - yield os.path.join(root, file) + # NOTE: os.path.join is ~5.5 times slower + yield f"{root}{os.sep}{file}" class WorkingTree(BaseTree): diff --git a/tests/func/remote/test_local.py b/tests/func/remote/test_local.py deleted file mode 100644 index 8fb109c309..0000000000 --- a/tests/func/remote/test_local.py +++ /dev/null @@ -1,65 +0,0 @@ -import os - -import mock - -from dvc.exceptions import DvcException -from dvc.remote.local import LocalRemote -from tests.utils import trees_equal - - -def test_dont_fail_on_unpacked_create_fail(tmp_dir, dvc): - (stage,) = tmp_dir.dvc_gen({"dir": {"file": "file_content"}}) - - with mock.patch.object( - LocalRemote, "_create_unpacked_dir", side_effect=DvcException("msg") - ) as unpacked_create_spy, dvc.state: - assert not dvc.cache.local.changed_cache(stage.outs[0].checksum) - assert unpacked_create_spy.call_count == 1 - - -def test_remove_unpacked_on_create_fail(tmp_dir, dvc): - (stage,) = tmp_dir.dvc_gen({"dir": {"file": "file_content"}}) - unpacked_dir = stage.outs[0].cache_path + LocalRemote.UNPACKED_DIR_SUFFIX - - # artificial unpacked dir for test purpose - os.makedirs(unpacked_dir) - assert os.path.exists(unpacked_dir) - - with mock.patch.object( - LocalRemote, "_create_unpacked_dir", side_effect=DvcException("msg") - ), dvc.state: - assert not dvc.cache.local.changed_cache(stage.outs[0].checksum) - - assert not os.path.exists(unpacked_dir) - - -def test_create_unpacked_on_status(tmp_dir, dvc): - (stage,) = tmp_dir.dvc_gen({"dir": {"file": "file_content"}}) - unpacked_dir = stage.outs[0].cache_path + LocalRemote.UNPACKED_DIR_SUFFIX - assert not os.path.exists(unpacked_dir) - - with dvc.state: - assert not dvc.cache.local.changed_cache(stage.outs[0].checksum) - assert os.path.exists(unpacked_dir) - trees_equal("dir", unpacked_dir) - - -def test_dir_cache_changed_on_single_cache_file_modification(tmp_dir, dvc): - (stage,) = tmp_dir.dvc_gen( - {"dir": {"file1": "file1 content", "file2": "file2 content"}} - ) - unpacked_dir = stage.outs[0].cache_path + LocalRemote.UNPACKED_DIR_SUFFIX - assert not os.path.exists(unpacked_dir) - file_md5 = stage.outs[0].dir_cache[0]["md5"] - - with dvc.state: - assert not dvc.cache.local.changed_cache(stage.outs[0].checksum) - assert os.path.exists(unpacked_dir) - - cache_file_path = dvc.cache.local.get(file_md5) - os.chmod(cache_file_path, 0o644) - with open(cache_file_path, "a") as fobj: - fobj.write("modification") - - with dvc.state: - assert dvc.cache.local.changed_cache(stage.outs[0].checksum) diff --git a/tests/func/test_gc.py b/tests/func/test_gc.py index 5815b06e9e..0ddb6e025a 100644 --- a/tests/func/test_gc.py +++ b/tests/func/test_gc.py @@ -219,6 +219,8 @@ def test_gc_no_unpacked_dir(tmp_dir, dvc): dir_stages[0].outs[0].cache_path + LocalRemote.UNPACKED_DIR_SUFFIX ) + # older (pre 1.0) versions of dvc used to generate this dir + shutil.copytree("dir", unpackeddir) assert os.path.exists(unpackeddir) dvc.gc(force=True, workspace=True) diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py index 71672e0694..1f7c4d8fc3 100644 --- a/tests/unit/test_ignore.py +++ b/tests/unit/test_ignore.py @@ -69,7 +69,6 @@ def test_ignore_from_file_should_filter_dirs_and_files(): ), ("dont_ignore.txt", ["dont_ignore"], False), ("dont_ignore.txt", ["dont*", "!dont_ignore.txt"], False), - ("../../../something.txt", ["**/something.txt"], False), ], ) def test_match_ignore_from_file(