From a80a85e6bd144189bf63df535483ae628136ce14 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Fri, 22 Apr 2022 19:35:58 +0300 Subject: [PATCH] dvc: detach gitfs from local paths Our gitfs paths are currently tied to the local path in which the cloned repository resides in, which results in a lot of hacky assumptions throughout the code. This PR detaches gitfs from local paths completely, making dvc work with a proper fsspec gitfs implementation. Part of #7543 --- dvc/config.py | 11 +++-- dvc/external_repo.py | 20 +++++---- dvc/fs/git.py | 12 ++---- dvc/fs/local.py | 2 +- dvc/fs/path.py | 18 +++++--- dvc/fs/repo.py | 18 ++++++-- dvc/ignore.py | 10 +++-- dvc/output.py | 26 ++++++++---- dvc/parsing/__init__.py | 10 +---- dvc/parsing/context.py | 2 +- dvc/pathspec_math.py | 5 +-- dvc/repo/__init__.py | 49 ++++++++++++++-------- dvc/repo/brancher.py | 18 ++++++++ dvc/repo/collect.py | 3 +- dvc/repo/diff.py | 22 ++++------ dvc/repo/params/show.py | 29 ++++--------- dvc/repo/plots/__init__.py | 19 +++++---- dvc/repo/stage.py | 13 +++--- dvc/stage/loader.py | 6 ++- dvc/stage/utils.py | 6 +-- dvc/utils/__init__.py | 2 +- setup.cfg | 2 +- tests/func/experiments/test_experiments.py | 28 ++++++------- tests/func/parsing/test_resolver.py | 9 ++-- tests/func/test_fs.py | 2 +- tests/func/test_ignore.py | 27 +++--------- tests/func/test_used_objs.py | 18 ++------ tests/unit/stage/test_utils.py | 9 ++-- tests/unit/test_external_repo.py | 3 +- tests/unit/test_ignore.py | 1 + 30 files changed, 208 insertions(+), 192 deletions(-) diff --git a/dvc/config.py b/dvc/config.py index caa62e785f..61264539ea 100644 --- a/dvc/config.py +++ b/dvc/config.py @@ -89,6 +89,8 @@ def __init__( from dvc.fs.local import LocalFileSystem self.dvc_dir = dvc_dir + self.wfs = LocalFileSystem() + self.fs = fs or self.wfs if not dvc_dir: try: @@ -98,10 +100,7 @@ def __init__( except NotDvcRepoError: self.dvc_dir = None else: - self.dvc_dir = os.path.abspath(os.path.realpath(dvc_dir)) - - self.wfs = LocalFileSystem(url=self.dvc_dir) - self.fs = fs or self.wfs + self.dvc_dir = self.fs.path.abspath(self.fs.path.realpath(dvc_dir)) self.load(validate=validate, config=config) @@ -124,8 +123,8 @@ def files(self): } if self.dvc_dir is not None: - files["repo"] = os.path.join(self.dvc_dir, self.CONFIG) - files["local"] = os.path.join(self.dvc_dir, self.CONFIG_LOCAL) + files["repo"] = self.fs.path.join(self.dvc_dir, self.CONFIG) + files["local"] = self.fs.path.join(self.dvc_dir, self.CONFIG_LOCAL) return files diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 38aecf7276..16a59a3007 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -50,19 +50,21 @@ def external_repo( config.update(cache_config) if for_write: + scm = None root_dir = path fs = None else: - root_dir = os.path.realpath(path) - scm = Git(root_dir) + scm = Git(path) fs = GitFileSystem(scm=scm, rev=rev) + root_dir = "/" repo_kwargs = dict( root_dir=root_dir, url=url, fs=fs, config=config, - repo_factory=erepo_factory(url, cache_config), + repo_factory=erepo_factory(url, root_dir, cache_config), + scm=scm, **kwargs, ) @@ -92,14 +94,16 @@ def external_repo( _remove(path) -def erepo_factory(url, cache_config, *args, **kwargs): - def make_repo(path, **_kwargs): +def erepo_factory(url, root_dir, cache_config): + from dvc.fs.local import localfs + + def make_repo(path, fs=None, **_kwargs): _config = cache_config.copy() if os.path.isdir(url): - rel = os.path.relpath(path, _kwargs["fs"].fs_args["scm"].root_dir) - repo_path = os.path.join(url, rel) + fs = fs or localfs + repo_path = os.path.join(url, *fs.path.relparts(path, root_dir)) _config.update(_get_remote_config(repo_path)) - return Repo(path, config=_config, **_kwargs) + return Repo(path, fs=fs, config=_config, **_kwargs) return make_repo diff --git a/dvc/fs/git.py b/dvc/fs/git.py index 4eab7b098f..3b20f363d0 100644 --- a/dvc/fs/git.py +++ b/dvc/fs/git.py @@ -1,4 +1,3 @@ -import os import threading from typing import TYPE_CHECKING, Any @@ -15,7 +14,6 @@ class GitFileSystem(FileSystem): # pylint:disable=abstract-method """Proxies the repo file access methods to Git objects""" - sep = os.sep scheme = "local" def __init__( @@ -40,12 +38,6 @@ def __init__( } ) - @cached_property - def path(self): - from .path import Path - - return Path(self.sep, getcwd=os.getcwd) - @wrap_prop(threading.Lock()) @cached_property def fs(self) -> "FsspecGitFileSystem": @@ -53,6 +45,10 @@ def fs(self) -> "FsspecGitFileSystem": return FsspecGitFileSystem(**self.fs_args) + @cached_property + def path(self): + return self.fs.path + @property def rev(self) -> str: return self.fs.rev diff --git a/dvc/fs/local.py b/dvc/fs/local.py index c0421982e5..aea332299d 100644 --- a/dvc/fs/local.py +++ b/dvc/fs/local.py @@ -182,7 +182,7 @@ def fs(self): def path(self): from .path import Path - return Path(self.sep, os.getcwd) + return Path(self.sep, getcwd=os.getcwd, realpath=os.path.realpath) def upload_fobj(self, fobj, to_info, **kwargs): self.makedirs(self.path.parent(to_info)) diff --git a/dvc/fs/path.py b/dvc/fs/path.py index 9e734eac9d..918f269222 100644 --- a/dvc/fs/path.py +++ b/dvc/fs/path.py @@ -3,11 +3,13 @@ class Path: - def __init__(self, sep, getcwd=None): + def __init__(self, sep, getcwd=None, realpath=None): def _getcwd(): return "" self.getcwd = getcwd or _getcwd + self.realpath = realpath or self.abspath + if sep == posixpath.sep: self.flavour = posixpath elif sep == ntpath.sep: @@ -15,6 +17,12 @@ def _getcwd(): else: raise ValueError(f"unsupported separator '{sep}'") + def chdir(self, path): + def _getcwd(): + return path + + self.getcwd = _getcwd + def join(self, *parts): return self.flavour.join(*parts) @@ -104,15 +112,15 @@ def overlaps(self, left, right): # pylint: disable=arguments-out-of-order return self.isin_or_eq(left, right) or self.isin(right, left) - def relpath(self, path, start): - if not start: + def relpath(self, path, start=None): + if start is None: start = "." return self.flavour.relpath( self.abspath(path), start=self.abspath(start) ) - def relparts(self, path, base): - return self.parts(self.relpath(path, base)) + def relparts(self, path, start=None): + return self.parts(self.relpath(path, start=start)) def as_posix(self, path): return path.replace(self.flavour.sep, posixpath.sep) diff --git a/dvc/fs/repo.py b/dvc/fs/repo.py index 8d1977e4f3..a9728b63e8 100644 --- a/dvc/fs/repo.py +++ b/dvc/fs/repo.py @@ -106,7 +106,12 @@ def __init__( self.repo_factory = repo_factory def _getcwd(): - return self.root_marker + relparts = () + if repo.fs.path.isin(repo.fs.path.getcwd(), repo.root_dir): + relparts = repo.fs.path.relparts( + repo.fs.path.getcwd(), repo.root_dir + ) + return self.root_marker + self.sep.join(relparts) self.path = Path(self.sep, getcwd=_getcwd) self.repo = repo @@ -190,7 +195,7 @@ def _open(*args, **kwargs): repo_kwargs["config"] = cache_config else: repo_kwargs["cache_dir"] = cache_dir - factory = erepo_factory(url, cache_config) + factory = erepo_factory(url, root, cache_config) with _open( url if url else root, @@ -237,6 +242,7 @@ def _update(self, dirs, starting_repo): repo = self.repo_factory( d, fs=self.repo.fs, + scm=self.repo.scm, repo_factory=self.repo_factory, ) self._dvcfss[key] = DvcFileSystem(repo=repo) @@ -249,7 +255,7 @@ def _is_dvc_repo(self, dir_path): from dvc.repo import Repo - repo_path = os.path.join(dir_path, Repo.DVC_DIR) + repo_path = self.repo.fs.path.join(dir_path, Repo.DVC_DIR) return self.repo.fs.isdir(repo_path) def _get_fs_pair( @@ -417,7 +423,7 @@ def info(self, path, **kwargs): if not dvc_info and not fs_info: raise FileNotFoundError - info = _merge_info(dvc_fs.repo, fs_info, dvc_info) + info = _merge_info(repo, fs_info, dvc_info) info["name"] = path return info @@ -445,6 +451,10 @@ def fs(self): def isdvc(self, path, **kwargs): return self.fs.isdvc(path, **kwargs) + @property + def path(self): # pylint: disable=invalid-overridden-method + return self.fs.path + @property def repo(self): return self.fs.repo diff --git a/dvc/ignore.py b/dvc/ignore.py index 146098345b..e035f5eb7e 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -35,7 +35,6 @@ def __init__(self, pattern_list, dirname, sep): self.sep = sep self.pattern_list = pattern_list self.dirname = dirname - self.prefix = self.dirname + self.sep self.regex_pattern_list = [ GitWildMatchPattern.pattern_to_regex(pattern_info.patterns) @@ -74,10 +73,13 @@ def __call__(self, root: List[str], dirs: List[str], files: List[str]): def _get_normalize_path(self, dirname, basename): # NOTE: `relpath` is too slow, so we have to assume that both # `dirname` and `self.dirname` are relative or absolute together. + + prefix = self.dirname.rstrip(self.sep) + self.sep + if dirname == self.dirname: path = basename - elif dirname.startswith(self.prefix): - rel = dirname[len(self.prefix) :] + elif dirname.startswith(prefix): + rel = dirname[len(prefix) :] # NOTE: `os.path.join` is ~x5.5 slower path = f"{rel}{self.sep}{basename}" else: @@ -210,6 +212,7 @@ def _update_trie(self, dirname: str, trie: Trie) -> None: new_pattern = DvcIgnorePatterns.from_file(path, self.fs, name) if old_pattern: plist, prefix = merge_patterns( + self.fs.path.flavour, old_pattern.pattern_list, old_pattern.dirname, new_pattern.pattern_list, @@ -259,6 +262,7 @@ def _update_sub_repo(self, path, ignore_trie: Trie): old_pattern = ignore_trie.longest_prefix(key).value if old_pattern: plist, prefix = merge_patterns( + self.fs.path.flavour, old_pattern.pattern_list, old_pattern.dirname, new_pattern.pattern_list, diff --git a/dvc/output.py b/dvc/output.py index 0535fbf3fe..1fe0a22b02 100644 --- a/dvc/output.py +++ b/dvc/output.py @@ -295,12 +295,21 @@ def __init__( if ( self.fs.scheme == "local" and stage + and isinstance(stage.repo.fs, LocalFileSystem) and path_isin(path, stage.repo.root_dir) ): self.def_path = relpath(path, stage.wdir) + self.fs = stage.repo.fs else: self.def_path = path + if ( + self.repo + and self.fs.scheme == "local" + and not self.fs.path.isabs(self.def_path) + ): + self.fs = self.repo.fs + self._validate_output_path(path, stage) # This output (and dependency) objects have too many paths/urls # here is a list and comments: @@ -342,7 +351,7 @@ def _parse_path(self, fs, fs_path): if self.stage and not os.path.isabs(fs_path): fs_path = fs.path.join(self.stage.wdir, fs_path) - abs_p = os.path.abspath(os.path.normpath(fs_path)) + abs_p = fs.path.abspath(fs.path.normpath(fs_path)) return abs_p def __repr__(self): @@ -361,11 +370,11 @@ def __str__(self): ): return str(self.def_path) - cur_dir = os.getcwd() - if path_isin(cur_dir, self.repo.root_dir): - return relpath(self.fs_path, cur_dir) + cur_dir = self.fs.path.getcwd() + if self.fs.path.isin(cur_dir, self.repo.root_dir): + return self.fs.path.relpath(self.fs_path, cur_dir) - return relpath(self.fs_path, self.repo.root_dir) + return self.fs.path.relpath(self.fs_path, self.repo.root_dir) @property def scheme(self): @@ -379,11 +388,12 @@ def is_in_repo(self): if urlparse(self.def_path).scheme == "remote": return False - if os.path.isabs(self.def_path): + if self.fs.path.isabs(self.def_path): return False - return self.repo and path_isin( - os.path.realpath(self.fs_path), self.repo.root_dir + return self.repo and self.fs.path.isin( + self.fs.path.realpath(self.fs_path), + self.repo.root_dir, ) @property diff --git a/dvc/parsing/__init__.py b/dvc/parsing/__init__.py index 7efa4616cd..041863a55c 100644 --- a/dvc/parsing/__init__.py +++ b/dvc/parsing/__init__.py @@ -18,7 +18,6 @@ from dvc.exceptions import DvcException from dvc.parsing.interpolate import ParseError -from dvc.utils import relpath from .context import ( Context, @@ -135,19 +134,14 @@ def make_definition( class DataResolver: def __init__(self, repo: "Repo", wdir: str, d: dict): - from dvc.fs import LocalFileSystem - self.fs = fs = repo.fs if os.path.isabs(wdir): - start = ( - os.curdir if isinstance(fs, LocalFileSystem) else repo.root_dir - ) - wdir = relpath(wdir, start) + wdir = fs.path.relpath(wdir) wdir = "" if wdir == os.curdir else wdir self.wdir = wdir - self.relpath = os.path.normpath(fs.path.join(self.wdir, "dvc.yaml")) + self.relpath = fs.path.normpath(fs.path.join(self.wdir, "dvc.yaml")) vars_ = d.get(VARS_KWD, []) check_interpolations(vars_, VARS_KWD, self.relpath) diff --git a/dvc/parsing/context.py b/dvc/parsing/context.py index 98db8852f5..899ee85646 100644 --- a/dvc/parsing/context.py +++ b/dvc/parsing/context.py @@ -395,7 +395,7 @@ def merge_update(self, other: "Context", overwrite=False): def merge_from(self, fs, item: str, wdir: str, overwrite=False): path, _, keys_str = item.partition(":") - path = os.path.normpath(fs.path.join(wdir, path)) + path = fs.path.normpath(fs.path.join(wdir, path)) select_keys = lfilter(bool, keys_str.split(",")) if keys_str else None if path in self.imports: diff --git a/dvc/pathspec_math.py b/dvc/pathspec_math.py index 3a1fcbd22f..b0541176e7 100644 --- a/dvc/pathspec_math.py +++ b/dvc/pathspec_math.py @@ -2,7 +2,6 @@ # Including changing base dir of path specification patterns and merging # of two path specification patterns with different base # All the operations follow the documents of `gitignore` -import os from collections import namedtuple from pathspec.util import normalize_file @@ -68,7 +67,7 @@ def _change_dirname(dirname, pattern_list, new_dirname): ] -def merge_patterns(pattern_a, prefix_a, pattern_b, prefix_b): +def merge_patterns(flavour, pattern_a, prefix_a, pattern_b, prefix_b): """ Merge two path specification patterns. @@ -81,7 +80,7 @@ def merge_patterns(pattern_a, prefix_a, pattern_b, prefix_b): if not pattern_b: return pattern_a, prefix_a - longest_common_dir = os.path.commonpath([prefix_a, prefix_b]) + longest_common_dir = flavour.commonpath([prefix_a, prefix_b]) new_pattern_a = _change_dirname(prefix_a, pattern_a, longest_common_dir) new_pattern_b = _change_dirname(prefix_b, pattern_b, longest_common_dir) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index da126193d9..47ec9b48eb 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -18,6 +18,7 @@ from dvc.fs.base import FileSystem from dvc.objects.file import HashFile from dvc.repo.scm_context import SCMContext + from dvc.scm import Base logger = logging.getLogger(__name__) @@ -85,29 +86,32 @@ def _get_repo_dirs( root_dir: str = None, fs: "FileSystem" = None, uninitialized: bool = False, + scm: "Base" = None, ): - from dvc.scm import SCM, Base, SCMError - from dvc.utils.fs import makedirs + from dvc.fs.local import localfs + from dvc.scm import SCM, SCMError dvc_dir = None tmp_dir = None try: root_dir = self.find_root(root_dir, fs) - dvc_dir = os.path.join(root_dir, self.DVC_DIR) - tmp_dir = os.path.join(dvc_dir, "tmp") - makedirs(tmp_dir, exist_ok=True) + fs = fs or localfs + dvc_dir = fs.path.join(root_dir, self.DVC_DIR) + tmp_dir = fs.path.join(dvc_dir, "tmp") except NotDvcRepoError: if not uninitialized: raise - try: - scm = SCM(root_dir or os.curdir) - except SCMError: - scm = SCM(os.curdir, no_scm=True) + if not scm: + try: + scm = SCM(root_dir or os.curdir) + except SCMError: + scm = SCM(os.curdir, no_scm=True) - assert isinstance(scm, Base) - root_dir = scm.root_dir + if not fs or not root_dir: + root_dir = scm.root_dir + assert root_dir return root_dir, dvc_dir, tmp_dir def _get_database_dir(self, db_name): @@ -148,6 +152,7 @@ def __init__( config=None, url=None, repo_factory=None, + scm=None, ): from dvc.config import Config from dvc.data.db import ODBManager @@ -166,14 +171,18 @@ def __init__( self.url = url self._fs_conf = {"repo_factory": repo_factory} self._fs = fs or localfs - self._scm = None + self._scm = scm if rev and not fs: - self._scm = SCM(root_dir or os.curdir) + self._scm = scm = SCM(root_dir or os.curdir) + root_dir = "/" self._fs = GitFileSystem(scm=self._scm, rev=rev) self.root_dir, self.dvc_dir, self.tmp_dir = self._get_repo_dirs( - root_dir=root_dir, fs=self.fs, uninitialized=uninitialized + root_dir=root_dir, + fs=self.fs, + uninitialized=uninitialized, + scm=scm, ) self.config = Config(self.dvc_dir, fs=self.fs, config=config) @@ -189,7 +198,11 @@ def __init__( self.lock = LockNoop() self.state = StateNoop() self.odb = ODBManager(self) + self.tmp_dir = None else: + from dvc.utils.fs import makedirs + + makedirs(self.tmp_dir, exist_ok=True) self.lock = make_lock( os.path.join(self.tmp_dir, "lock"), tmp_dir=self.tmp_dir, @@ -312,9 +325,9 @@ def __repr__(self): def find_root(cls, root=None, fs=None) -> str: from dvc.fs.local import LocalFileSystem, localfs - root = root or os.curdir - root_dir = os.path.realpath(root) fs = fs or localfs + root = root or os.curdir + root_dir = fs.path.realpath(root) if not fs.isdir(root_dir): raise NotDvcRepoError(f"directory '{root}' does not exist") @@ -443,7 +456,7 @@ def find_outs_by_path(self, path, outs=None, recursive=False, strict=True): # using `outs_graph` to ensure graph checks are run outs = outs or self.index.outs_graph - abs_path = os.path.abspath(path) + abs_path = self.fs.path.abspath(path) fs_path = abs_path def func(out): @@ -467,7 +480,7 @@ def eq(one, two): return matched def is_dvc_internal(self, path): - path_parts = os.path.normpath(path).split(os.path.sep) + path_parts = self.fs.path.normpath(path).split(self.fs.sep) return self.DVC_DIR in path_parts @cached_property diff --git a/dvc/repo/brancher.py b/dvc/repo/brancher.py index 30e6d758c2..f678855a41 100644 --- a/dvc/repo/brancher.py +++ b/dvc/repo/brancher.py @@ -31,7 +31,18 @@ def brancher( # noqa: E302 from dvc.fs.local import LocalFileSystem + repo_root_parts = () + if self.fs.path.isin(self.root_dir, self.scm.root_dir): + repo_root_parts = self.fs.path.relparts( + self.root_dir, self.scm.root_dir + ) + + cwd_parts = () + if self.fs.path.isin(self.fs.path.getcwd(), self.root_dir): + cwd_parts = self.fs.path.relparts(self.fs.path.getcwd(), self.root_dir) + saved_fs = self.fs + saved_root = self.root_dir scm = self.scm @@ -56,9 +67,16 @@ def brancher( # noqa: E302 for sha, names in found_revs.items(): self.fs = GitFileSystem(scm=scm, rev=sha) + self.root_dir = self.fs.path.join("/", *repo_root_parts) + + if cwd_parts: + cwd = self.fs.path.join("/", *cwd_parts) + self.fs.path.chdir(cwd) + # ignore revs that don't contain repo root # (i.e. revs from before a subdir=True repo was init'ed) if self.fs.exists(self.root_dir): yield sha if sha_only else ",".join(names) finally: self.fs = saved_fs + self.root_dir = saved_root diff --git a/dvc/repo/collect.py b/dvc/repo/collect.py index bc74dfd323..3f29852b16 100644 --- a/dvc/repo/collect.py +++ b/dvc/repo/collect.py @@ -31,7 +31,6 @@ def _collect_paths( rev: str = None, ) -> StrPaths: from dvc.fs.repo import RepoFileSystem - from dvc.utils import relpath fs = RepoFileSystem(repo=repo) fs_paths = [fs.from_os_path(target) for target in targets] @@ -41,7 +40,7 @@ def _collect_paths( if recursive and fs.isdir(fs_path): target_paths.extend(fs.find(fs_path)) - rel = relpath(fs_path) + rel = fs.path.relpath(fs_path) if not fs.exists(fs_path): if rev == "workspace" or rev == "": logger.warning("'%s' was not found in current workspace.", rel) diff --git a/dvc/repo/diff.py b/dvc/repo/diff.py index 176b05903d..df0c445efb 100644 --- a/dvc/repo/diff.py +++ b/dvc/repo/diff.py @@ -122,11 +122,11 @@ def _exists(output): return True def _to_path(output): - return ( - str(output) - if not output.is_dir_checksum - else os.path.join(str(output), "") - ) + relparts = output.fs.path.relparts(output.fs_path) + base = os.path.join(*relparts) + if output.is_dir_checksum: + return os.path.join(base, "") + return base for output in repo.index.outs: if _exists(output): @@ -168,20 +168,14 @@ def _to_path(output): def _dir_output_paths(fs, fs_path, obj, targets=None): - if fs.scheme == "local": - # NOTE: workaround for filesystems that are based on full local paths - # (e.g. gitfs, dvcfs, repofs). Proper solution is to use upcoming - # fsspec's prefixfs to use relpaths as fs_paths. - base = fs.path.relpath(fs_path, os.getcwd()) - else: - base = fs_path + base = os.path.join(*fs.path.relparts(fs_path)) for key, _, oid in obj: fname = fs.path.join(fs_path, *key) if targets is None or any( fs.path.isin_or_eq(fname, target) for target in targets ): # pylint: disable=no-member - yield fs.path.join(base, *key), oid.value + yield os.path.join(base, *key), oid.value def _filter_missing(repo_fs, paths): @@ -206,7 +200,7 @@ def _targets_to_paths(repo_fs, targets): for target in targets: if repo_fs.exists(target): - paths.append(os.path.abspath(target)) + paths.append(repo_fs.repo.fs.path.abspath(target)) else: missing.append(target) diff --git a/dvc/repo/params/show.py b/dvc/repo/params/show.py index 15d906a3b4..677e33a684 100644 --- a/dvc/repo/params/show.py +++ b/dvc/repo/params/show.py @@ -10,12 +10,7 @@ from dvc.scm import NoSCMError from dvc.stage import PipelineStage from dvc.ui import ui -from dvc.utils import ( - error_handler, - errored_revisions, - onerror_collect, - relpath, -) +from dvc.utils import error_handler, errored_revisions, onerror_collect from dvc.utils.serialize import LOADERS if TYPE_CHECKING: @@ -42,7 +37,7 @@ def _collect_configs( ) all_fs_paths = fs_paths + [p.fs_path for p in params] if not targets: - default_params = os.path.join( + default_params = repo.fs.path.join( repo.root_dir, ParamsDependency.DEFAULT_PARAMS_FILE ) if default_params not in all_fs_paths and repo.fs.exists( @@ -75,41 +70,33 @@ def _read_params( onerror=onerror, flatten=False ) if params_dict: - res[ - repo.fs.path.relpath(param.fs_path, os.getcwd()) - ] = params_dict + name = os.sep.join(repo.fs.path.relparts(param.fs_path)) + res[name] = params_dict else: fs_paths += [param.fs_path for param in params] for fs_path in fs_paths: from_path = _read_fs_path(repo.fs, fs_path, onerror=onerror) if from_path: - res[repo.fs.path.relpath(fs_path, os.getcwd())] = from_path + name = os.sep.join(repo.fs.path.relparts(fs_path)) + res[name] = from_path return res def _collect_vars(repo, params) -> Dict: - from dvc.fs.git import GitFileSystem - vars_params: Dict[str, Dict] = defaultdict(dict) - rel_to_root = relpath(repo.root_dir) for stage in repo.index.stages: if isinstance(stage, PipelineStage) and stage.tracked_vars: for file, vars_ in stage.tracked_vars.items(): - if isinstance(repo.fs, GitFileSystem): - # GitFileSystem uses relatively-absolute paths from the - # root of the repo. We need to convert them to relative - # paths based on the current working directory. - file = os.path.normpath(os.path.join(rel_to_root, file)) - # `params` file are shown regardless of `tracked` or not # to reduce noise and duplication, they are skipped if file in params: continue - vars_params[file].update(vars_) + name = os.sep.join(repo.fs.path.parts(file)) + vars_params[name].update(vars_) return vars_params diff --git a/dvc/repo/plots/__init__.py b/dvc/repo/plots/__init__.py index f99c71166f..f8354b4186 100644 --- a/dvc/repo/plots/__init__.py +++ b/dvc/repo/plots/__init__.py @@ -17,12 +17,7 @@ from funcy import cached_property, first, project from dvc.exceptions import DvcException -from dvc.utils import ( - error_handler, - errored_revisions, - onerror_collect, - relpath, -) +from dvc.utils import error_handler, errored_revisions, onerror_collect from dvc.utils.serialize import LOADERS if TYPE_CHECKING: @@ -110,6 +105,7 @@ def _collect_from_revision( plots = _collect_plots(self.repo, targets, revision, recursive) res: Dict[str, Any] = {} for fs_path, rev_props in plots.items(): + base = os.path.join(*fs.path.relparts(fs_path, fs.fs.root_marker)) if fs.isdir(fs_path): plot_files = [] unpacking_res = _unpack_dir_files(fs, fs_path, onerror=onerror) @@ -118,12 +114,17 @@ def _collect_from_revision( "data" ): plot_files.append( - (pi, relpath(pi, self.repo.root_dir)) + ( + pi, + os.path.join( + base, *fs.path.relparts(pi, fs_path) + ), + ) ) else: - res[relpath(fs_path, self.repo.root_dir)] = unpacking_res + res[base] = unpacking_res else: - plot_files = [(fs_path, relpath(fs_path, self.repo.root_dir))] + plot_files = [(fs_path, base)] props = props or {} diff --git a/dvc/repo/stage.py b/dvc/repo/stage.py index ad19af396d..ffd8d9c7e1 100644 --- a/dvc/repo/stage.py +++ b/dvc/repo/stage.py @@ -1,6 +1,5 @@ import fnmatch import logging -import os import time import typing from contextlib import suppress @@ -24,7 +23,7 @@ OutputNotFoundError, ) from dvc.repo import lock_repo -from dvc.utils import parse_target, relpath +from dvc.utils import as_posix, parse_target, relpath logger = logging.getLogger(__name__) @@ -378,7 +377,7 @@ def collect( if recursive and self.fs.isdir(target): from dvc.repo.graph import collect_inside_path - path = os.path.abspath(target) + path = self.fs.path.abspath(target) return collect_inside_path(path, graph or self.graph) stages = self.from_target(target, accept_group=accept_group, glob=glob) @@ -416,6 +415,8 @@ def collect_granular( if not target: return [StageInfo(stage) for stage in self.repo.index] + target = as_posix(target) + stages, file, _ = _collect_specific_target( self, target, with_deps, recursive, accept_group ) @@ -423,7 +424,7 @@ def collect_granular( if not (recursive and self.fs.isdir(target)): try: (out,) = self.repo.find_outs_by_path(target, strict=False) - return [StageInfo(out.stage, os.path.abspath(target))] + return [StageInfo(out.stage, self.fs.path.abspath(target))] except OutputNotFoundError: pass @@ -467,7 +468,7 @@ def _collect_repo(self, onerror: Callable[[str, Exception], None] = None): from dvc.fs.local import LocalFileSystem scm = self.repo.scm - sep = os.sep + sep = self.fs.sep outs: Set[str] = set() is_local_fs = isinstance(self.fs, LocalFileSystem) @@ -493,7 +494,7 @@ def is_out_or_ignored(root, directory): for root, dirs, files in walk_iter: dvcfile_filter = partial(is_dvcfile_and_not_ignored, root) for file in filter(dvcfile_filter, files): - file_path = os.path.join(root, file) + file_path = self.fs.path.join(root, file) try: new_stages = self.load_file(file_path) except DvcException as exc: diff --git a/dvc/stage/loader.py b/dvc/stage/loader.py index 810389e0ad..8dbae672d0 100644 --- a/dvc/stage/loader.py +++ b/dvc/stage/loader.py @@ -77,7 +77,7 @@ def load_stage(cls, dvcfile, name, stage_data, lock_data=None): assert stage_data and isinstance(stage_data, dict) path, wdir = resolve_paths( - dvcfile.path, stage_data.get(Stage.PARAM_WDIR) + dvcfile.repo.fs, dvcfile.path, stage_data.get(Stage.PARAM_WDIR) ) stage = loads_from(PipelineStage, dvcfile.repo, path, wdir, stage_data) stage.name = name @@ -184,7 +184,9 @@ def __getitem__(self, item): @classmethod def load_stage(cls, dvcfile, d, stage_text): - path, wdir = resolve_paths(dvcfile.path, d.get(Stage.PARAM_WDIR)) + path, wdir = resolve_paths( + dvcfile.repo.fs, dvcfile.path, d.get(Stage.PARAM_WDIR) + ) stage = loads_from(Stage, dvcfile.repo, path, wdir, d) stage._stage_text = stage_text # noqa, pylint:disable=protected-access stage.deps = dependency.loadd_from( diff --git a/dvc/stage/utils.py b/dvc/stage/utils.py index 8472a84c11..2a5c2875ae 100644 --- a/dvc/stage/utils.py +++ b/dvc/stage/utils.py @@ -210,10 +210,10 @@ def resolve_wdir(wdir, path): return pathlib.PurePath(rel_wdir).as_posix() if rel_wdir != "." else None -def resolve_paths(path, wdir=None): - path = os.path.abspath(path) +def resolve_paths(fs, path, wdir=None): + path = fs.path.abspath(path) wdir = wdir or os.curdir - wdir = os.path.abspath(os.path.join(os.path.dirname(path), wdir)) + wdir = fs.path.abspath(fs.path.join(fs.path.dirname(path), wdir)) return path, wdir diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index 81157ed997..add978e055 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -332,7 +332,7 @@ def resolve_network_drive_windows(path_to_resolve): return os.path.relpath(path, start) -def as_posix(path): +def as_posix(path: str) -> str: import ntpath import posixpath diff --git a/setup.cfg b/setup.cfg index 7ad18d4309..0ab6fa9a1b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -72,7 +72,7 @@ install_requires = aiohttp-retry>=2.4.5 diskcache>=5.2.1 jaraco.windows>=5.7.0; python_version < '3.8' and sys_platform == 'win32' - scmrepo==0.0.19 + scmrepo==0.0.22 dvc-render==0.0.5 dvclive>=0.7.3 diff --git a/tests/func/experiments/test_experiments.py b/tests/func/experiments/test_experiments.py index 082c5ca08e..08d1a80635 100644 --- a/tests/func/experiments/test_experiments.py +++ b/tests/func/experiments/test_experiments.py @@ -33,7 +33,7 @@ def test_new_simple(tmp_dir, scm, dvc, exp_stage, mocker, name, workspace): new_mock.assert_called_once() fs = scm.get_fs(exp) - with fs.open(tmp_dir / "metrics.yaml", mode="r", encoding="utf-8") as fobj: + with fs.open("metrics.yaml", mode="r", encoding="utf-8") as fobj: assert fobj.read().strip() == "foo: 2" if workspace: @@ -75,7 +75,7 @@ def test_experiment_exists(tmp_dir, scm, dvc, exp_stage, mocker, workspace): exp = first(results) fs = scm.get_fs(exp) - with fs.open(tmp_dir / "metrics.yaml", mode="r", encoding="utf-8") as fobj: + with fs.open("metrics.yaml", mode="r", encoding="utf-8") as fobj: assert fobj.read().strip() == "foo: 3" @@ -145,7 +145,7 @@ def test_modify_params(tmp_dir, scm, dvc, mocker, changes, expected): new_mock.assert_called_once() fs = scm.get_fs(exp) - with fs.open(tmp_dir / "metrics.yaml", mode="r") as fobj: + with fs.open("metrics.yaml", mode="r") as fobj: assert fobj.read().strip() == expected @@ -263,9 +263,9 @@ def test_update_py_params(tmp_dir, scm, dvc): exp_a = first(results) fs = scm.get_fs(exp_a) - with fs.open(tmp_dir / "params.py", mode="r", encoding="utf-8") as fobj: + with fs.open("params.py", mode="r", encoding="utf-8") as fobj: assert fobj.read().strip() == "INT = 2" - with fs.open(tmp_dir / "metrics.py", mode="r", encoding="utf-8") as fobj: + with fs.open("metrics.py", mode="r", encoding="utf-8") as fobj: assert fobj.read().strip() == "INT = 2" tmp_dir.gen( @@ -309,9 +309,9 @@ def _dos2unix(text): return text.replace("\r\n", "\n") fs = scm.get_fs(exp_a) - with fs.open(tmp_dir / "params.py", mode="r", encoding="utf-8") as fobj: + with fs.open("params.py", mode="r", encoding="utf-8") as fobj: assert _dos2unix(fobj.read().strip()) == result - with fs.open(tmp_dir / "metrics.py", mode="r", encoding="utf-8") as fobj: + with fs.open("metrics.py", mode="r", encoding="utf-8") as fobj: assert _dos2unix(fobj.read().strip()) == result tmp_dir.gen("params.py", "INT = 1\n") @@ -438,7 +438,7 @@ def test_untracked(tmp_dir, scm, dvc, caplog, workspace): assert fs.exists("dvc.yaml") assert fs.exists("dvc.lock") assert fs.exists("copy.py") - with fs.open(tmp_dir / "metrics.yaml", mode="r", encoding="utf-8") as fobj: + with fs.open("metrics.yaml", mode="r", encoding="utf-8") as fobj: assert fobj.read().strip() == "foo: 2" @@ -515,8 +515,8 @@ def test_subdir(tmp_dir, scm, dvc, workspace): fs = scm.get_fs(exp) for fname in ["metrics.yaml", "dvc.lock"]: - assert fs.exists(subdir / fname) - with fs.open(subdir / "metrics.yaml", mode="r", encoding="utf-8") as fobj: + assert fs.exists(f"dir/{fname}") + with fs.open("dir/metrics.yaml", mode="r", encoding="utf-8") as fobj: assert fobj.read().strip() == "foo: 2" assert dvc.experiments.get_exact_name(exp) == ref_info.name @@ -560,8 +560,8 @@ def test_subrepo(tmp_dir, scm, workspace): fs = scm.get_fs(exp) for fname in ["metrics.yaml", "dvc.lock"]: - assert fs.exists(subrepo / fname) - with fs.open(subrepo / "metrics.yaml", mode="r", encoding="utf-8") as fobj: + assert fs.exists(f"dir/repo/{fname}") + with fs.open("dir/repo/metrics.yaml", mode="r", encoding="utf-8") as fobj: assert fobj.read().strip() == "foo: 2" assert subrepo.dvc.experiments.get_exact_name(exp) == ref_info.name @@ -582,9 +582,7 @@ def test_queue(tmp_dir, scm, dvc, exp_stage, mocker): metrics = set() for exp in results: fs = scm.get_fs(exp) - with fs.open( - tmp_dir / "metrics.yaml", mode="r", encoding="utf-8" - ) as fobj: + with fs.open("metrics.yaml", mode="r", encoding="utf-8") as fobj: metrics.add(fobj.read().strip()) assert expected == metrics diff --git a/tests/func/parsing/test_resolver.py b/tests/func/parsing/test_resolver.py index a2ac0c42b8..6bb733e395 100644 --- a/tests/func/parsing/test_resolver.py +++ b/tests/func/parsing/test_resolver.py @@ -1,4 +1,3 @@ -import os from copy import deepcopy import pytest @@ -71,13 +70,13 @@ def test_load_vars_from_file(tmp_dir, dvc): def test_load_vars_with_relpath(tmp_dir, scm, dvc): tmp_dir.scm_gen(DEFAULT_PARAMS_FILE, dumps_yaml(DATA), commit="add params") - subdir = tmp_dir / "subdir" - d = {"vars": [os.path.relpath(tmp_dir / DEFAULT_PARAMS_FILE, subdir)]} - revisions = ["HEAD", "workspace"] for rev in dvc.brancher(revs=["HEAD"]): assert rev == revisions.pop() - resolver = DataResolver(dvc, subdir.fs_path, d) + d = { + "vars": [f"../{DEFAULT_PARAMS_FILE}"], + } + resolver = DataResolver(dvc, "subdir", d) assert resolver.context == deepcopy(DATA) diff --git a/tests/func/test_fs.py b/tests/func/test_fs.py index 90e0143b16..4b4b484689 100644 --- a/tests/func/test_fs.py +++ b/tests/func/test_fs.py @@ -198,7 +198,7 @@ def test_walk_dont_ignore_subrepos(tmp_dir, scm, dvc): get_dirs = itemgetter(1) assert set(get_dirs(next(dvc_fs.walk(path)))) == {".dvc", "subdir", ".git"} - assert set(get_dirs(next(scm_fs.walk(path)))) == {".dvc", "subdir"} + assert set(get_dirs(next(scm_fs.walk("/")))) == {".dvc", "subdir"} def test_fs_size(dvc, cloud): diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 37b66b9193..a3c465f412 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -111,6 +111,7 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname): assert ( DvcIgnorePatterns( *merge_patterns( + os.path, _to_pattern_info_list([".hg/", ".git/", ".git", ".dvc/"]), os.fspath(tmp_dir), _to_pattern_info_list([os.path.basename(dname)]), @@ -141,7 +142,8 @@ def test_ignore_on_branch(tmp_dir, scm, dvc): } dvc.fs = GitFileSystem(scm=scm, rev="branch") - assert dvc.dvcignore.is_ignored_file((tmp_dir / "foo").fs_path) + dvc.root_dir = "/" + assert dvc.dvcignore.is_ignored_file("/foo") def test_match_nested(tmp_dir, dvc): @@ -177,26 +179,6 @@ def test_ignore_external(tmp_dir, scm, dvc, tmp_path_factory): ) -def test_ignore_subrepo(tmp_dir, scm, dvc): - tmp_dir.gen({".dvcignore": "foo", "subdir": {"foo": "foo"}}) - scm.add([".dvcignore"]) - scm.commit("init parent dvcignore") - dvc._reset() - - subrepo_dir = tmp_dir / "subdir" - - result = walk_files(dvc, dvc.fs, subrepo_dir) - assert set(result) == set() - - with subrepo_dir.chdir(): - subrepo = Repo.init(subdir=True) - scm.add(str(subrepo_dir / "foo")) - scm.commit("subrepo init") - - for _ in subrepo.brancher(all_commits=True): - assert subrepo.fs.exists(subrepo_dir / "foo") - - def test_ignore_resurface_subrepo(tmp_dir, scm, dvc): tmp_dir.dvc_gen({"foo": "foo"}, commit="add foo") subrepo_dir = tmp_dir / "subdir" @@ -358,16 +340,19 @@ def test_pattern_trie_fs(tmp_dir, dvc): os.fspath(tmp_dir), ) first_pattern = merge_patterns( + os.path, *base_pattern, _to_pattern_info_list(["a", "b", "c"]), os.fspath(tmp_dir / "top" / "first"), ) second_pattern = merge_patterns( + os.path, *first_pattern, _to_pattern_info_list(["d", "e", "f"]), os.fspath(tmp_dir / "top" / "first" / "middle" / "second"), ) other_pattern = merge_patterns( + os.path, *base_pattern, _to_pattern_info_list(["1", "2", "3"]), os.fspath(tmp_dir / "other"), diff --git a/tests/func/test_used_objs.py b/tests/func/test_used_objs.py index f26410ead4..4d6c8a61dd 100644 --- a/tests/func/test_used_objs.py +++ b/tests/func/test_used_objs.py @@ -3,14 +3,6 @@ import pytest -from dvc.exceptions import NoOutputOrStageError -from dvc.stage.exceptions import StageFileDoesNotExistError - -gitfs_xfail = pytest.mark.xfail( - raises=(NoOutputOrStageError, StageFileDoesNotExistError), - reason="gitfs works on absolute paths only", -) - @pytest.mark.parametrize( "stage_wdir, cwd, target", @@ -19,23 +11,21 @@ (os.curdir, os.curdir, "train"), (os.curdir, os.curdir, "dvc.yaml:train"), (os.curdir, "sub", os.path.join(os.pardir, "foo")), - pytest.param( + ( os.curdir, "sub", os.path.join(os.pardir, "dvc.yaml:train"), - marks=gitfs_xfail, ), ("sub", os.curdir, os.path.join("sub", "foo")), ("sub", os.curdir, os.path.join("sub", "dvc.yaml:train")), ("sub", "sub", "foo"), - pytest.param("sub", "sub", "train", marks=gitfs_xfail), - pytest.param("sub", "sub", "dvc.yaml:train", marks=gitfs_xfail), + ("sub", "sub", "train"), + ("sub", "sub", "dvc.yaml:train"), ("sub", "dir", os.path.join(os.pardir, "sub", "foo")), - pytest.param( + ( "sub", "dir", os.path.join(os.pardir, "sub", "dvc.yaml:train"), - marks=gitfs_xfail, ), ], ) diff --git a/tests/unit/stage/test_utils.py b/tests/unit/stage/test_utils.py index 191013119b..1b87c0ad0c 100644 --- a/tests/unit/stage/test_utils.py +++ b/tests/unit/stage/test_utils.py @@ -1,5 +1,6 @@ import os +from dvc.fs.local import localfs from dvc.stage.utils import resolve_paths @@ -7,14 +8,16 @@ def test_resolve_paths(): p = os.path.join("dir", "subdir") file_path = os.path.join(p, "dvc.yaml") - path, wdir = resolve_paths(path=file_path, wdir="dir") + path, wdir = resolve_paths(fs=localfs, path=file_path, wdir="dir") assert path == os.path.abspath(file_path) assert wdir == os.path.abspath(os.path.join(p, "dir")) - path, wdir = resolve_paths(path=file_path) + path, wdir = resolve_paths(fs=localfs, path=file_path) assert path == os.path.abspath(file_path) assert wdir == os.path.abspath(p) - path, wdir = resolve_paths(path=file_path, wdir="../../some-dir") + path, wdir = resolve_paths( + fs=localfs, path=file_path, wdir="../../some-dir" + ) assert path == os.path.abspath(file_path) assert wdir == os.path.abspath("some-dir") diff --git a/tests/unit/test_external_repo.py b/tests/unit/test_external_repo.py index 4d83495d5b..d3a2bc6633 100644 --- a/tests/unit/test_external_repo.py +++ b/tests/unit/test_external_repo.py @@ -31,12 +31,13 @@ def test_hook_is_called(tmp_dir, erepo_dir, mocker): list(repo.repo_fs.walk("", ignore_subrepos=False)) # drain assert spy.call_count == len(subrepos) - paths = [os.path.join(repo.root_dir, path) for path in subrepo_paths] + paths = ["/" + path.replace("\\", "/") for path in subrepo_paths] spy.assert_has_calls( [ call( path, fs=repo.fs, + scm=repo.scm, repo_factory=repo.repo_fs.fs.repo_factory, ) for path in paths diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py index 1fd9365007..3a6be60ef7 100644 --- a/tests/unit/test_ignore.py +++ b/tests/unit/test_ignore.py @@ -11,6 +11,7 @@ def mock_dvcignore(dvcignore_path, patterns): fs = MagicMock() fs.path = localfs.path + fs.sep = localfs.sep with patch.object(fs, "open", mock_open(read_data="\n".join(patterns))): ignore_patterns = DvcIgnorePatterns.from_file( dvcignore_path, fs, "mocked"