Skip to content

Commit

Permalink
dvc: detach gitfs from local paths
Browse files Browse the repository at this point in the history
Our gitfs paths are currently tied to the local path in which the cloned
repository resides in, which results in a lot of hacky assumptions throughout
the code.

This PR detaches gitfs from local paths completely, making dvc work with a
proper fsspec gitfs implementation.

Part of #7543
  • Loading branch information
efiop committed May 11, 2022
1 parent 14c9348 commit a80a85e
Show file tree
Hide file tree
Showing 30 changed files with 208 additions and 192 deletions.
11 changes: 5 additions & 6 deletions dvc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ def __init__(
from dvc.fs.local import LocalFileSystem

self.dvc_dir = dvc_dir
self.wfs = LocalFileSystem()
self.fs = fs or self.wfs

if not dvc_dir:
try:
Expand All @@ -98,10 +100,7 @@ def __init__(
except NotDvcRepoError:
self.dvc_dir = None
else:
self.dvc_dir = os.path.abspath(os.path.realpath(dvc_dir))

self.wfs = LocalFileSystem(url=self.dvc_dir)
self.fs = fs or self.wfs
self.dvc_dir = self.fs.path.abspath(self.fs.path.realpath(dvc_dir))

self.load(validate=validate, config=config)

Expand All @@ -124,8 +123,8 @@ def files(self):
}

if self.dvc_dir is not None:
files["repo"] = os.path.join(self.dvc_dir, self.CONFIG)
files["local"] = os.path.join(self.dvc_dir, self.CONFIG_LOCAL)
files["repo"] = self.fs.path.join(self.dvc_dir, self.CONFIG)
files["local"] = self.fs.path.join(self.dvc_dir, self.CONFIG_LOCAL)

return files

Expand Down
20 changes: 12 additions & 8 deletions dvc/external_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,19 +50,21 @@ def external_repo(
config.update(cache_config)

if for_write:
scm = None
root_dir = path
fs = None
else:
root_dir = os.path.realpath(path)
scm = Git(root_dir)
scm = Git(path)
fs = GitFileSystem(scm=scm, rev=rev)
root_dir = "/"

repo_kwargs = dict(
root_dir=root_dir,
url=url,
fs=fs,
config=config,
repo_factory=erepo_factory(url, cache_config),
repo_factory=erepo_factory(url, root_dir, cache_config),
scm=scm,
**kwargs,
)

Expand Down Expand Up @@ -92,14 +94,16 @@ def external_repo(
_remove(path)


def erepo_factory(url, cache_config, *args, **kwargs):
def make_repo(path, **_kwargs):
def erepo_factory(url, root_dir, cache_config):
from dvc.fs.local import localfs

def make_repo(path, fs=None, **_kwargs):
_config = cache_config.copy()
if os.path.isdir(url):
rel = os.path.relpath(path, _kwargs["fs"].fs_args["scm"].root_dir)
repo_path = os.path.join(url, rel)
fs = fs or localfs
repo_path = os.path.join(url, *fs.path.relparts(path, root_dir))
_config.update(_get_remote_config(repo_path))
return Repo(path, config=_config, **_kwargs)
return Repo(path, fs=fs, config=_config, **_kwargs)

return make_repo

Expand Down
12 changes: 4 additions & 8 deletions dvc/fs/git.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import os
import threading
from typing import TYPE_CHECKING, Any

Expand All @@ -15,7 +14,6 @@
class GitFileSystem(FileSystem): # pylint:disable=abstract-method
"""Proxies the repo file access methods to Git objects"""

sep = os.sep
scheme = "local"

def __init__(
Expand All @@ -40,19 +38,17 @@ def __init__(
}
)

@cached_property
def path(self):
from .path import Path

return Path(self.sep, getcwd=os.getcwd)

@wrap_prop(threading.Lock())
@cached_property
def fs(self) -> "FsspecGitFileSystem":
from scmrepo.fs import GitFileSystem as FsspecGitFileSystem

return FsspecGitFileSystem(**self.fs_args)

@cached_property
def path(self):
return self.fs.path

@property
def rev(self) -> str:
return self.fs.rev
Expand Down
2 changes: 1 addition & 1 deletion dvc/fs/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def fs(self):
def path(self):
from .path import Path

return Path(self.sep, os.getcwd)
return Path(self.sep, getcwd=os.getcwd, realpath=os.path.realpath)

def upload_fobj(self, fobj, to_info, **kwargs):
self.makedirs(self.path.parent(to_info))
Expand Down
18 changes: 13 additions & 5 deletions dvc/fs/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,26 @@


class Path:
def __init__(self, sep, getcwd=None):
def __init__(self, sep, getcwd=None, realpath=None):
def _getcwd():
return ""

self.getcwd = getcwd or _getcwd
self.realpath = realpath or self.abspath

if sep == posixpath.sep:
self.flavour = posixpath
elif sep == ntpath.sep:
self.flavour = ntpath
else:
raise ValueError(f"unsupported separator '{sep}'")

def chdir(self, path):
def _getcwd():
return path

self.getcwd = _getcwd

def join(self, *parts):
return self.flavour.join(*parts)

Expand Down Expand Up @@ -104,15 +112,15 @@ def overlaps(self, left, right):
# pylint: disable=arguments-out-of-order
return self.isin_or_eq(left, right) or self.isin(right, left)

def relpath(self, path, start):
if not start:
def relpath(self, path, start=None):
if start is None:
start = "."
return self.flavour.relpath(
self.abspath(path), start=self.abspath(start)
)

def relparts(self, path, base):
return self.parts(self.relpath(path, base))
def relparts(self, path, start=None):
return self.parts(self.relpath(path, start=start))

def as_posix(self, path):
return path.replace(self.flavour.sep, posixpath.sep)
18 changes: 14 additions & 4 deletions dvc/fs/repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,12 @@ def __init__(
self.repo_factory = repo_factory

def _getcwd():
return self.root_marker
relparts = ()
if repo.fs.path.isin(repo.fs.path.getcwd(), repo.root_dir):
relparts = repo.fs.path.relparts(
repo.fs.path.getcwd(), repo.root_dir
)
return self.root_marker + self.sep.join(relparts)

self.path = Path(self.sep, getcwd=_getcwd)
self.repo = repo
Expand Down Expand Up @@ -190,7 +195,7 @@ def _open(*args, **kwargs):
repo_kwargs["config"] = cache_config
else:
repo_kwargs["cache_dir"] = cache_dir
factory = erepo_factory(url, cache_config)
factory = erepo_factory(url, root, cache_config)

with _open(
url if url else root,
Expand Down Expand Up @@ -237,6 +242,7 @@ def _update(self, dirs, starting_repo):
repo = self.repo_factory(
d,
fs=self.repo.fs,
scm=self.repo.scm,
repo_factory=self.repo_factory,
)
self._dvcfss[key] = DvcFileSystem(repo=repo)
Expand All @@ -249,7 +255,7 @@ def _is_dvc_repo(self, dir_path):

from dvc.repo import Repo

repo_path = os.path.join(dir_path, Repo.DVC_DIR)
repo_path = self.repo.fs.path.join(dir_path, Repo.DVC_DIR)
return self.repo.fs.isdir(repo_path)

def _get_fs_pair(
Expand Down Expand Up @@ -417,7 +423,7 @@ def info(self, path, **kwargs):
if not dvc_info and not fs_info:
raise FileNotFoundError

info = _merge_info(dvc_fs.repo, fs_info, dvc_info)
info = _merge_info(repo, fs_info, dvc_info)
info["name"] = path
return info

Expand Down Expand Up @@ -445,6 +451,10 @@ def fs(self):
def isdvc(self, path, **kwargs):
return self.fs.isdvc(path, **kwargs)

@property
def path(self): # pylint: disable=invalid-overridden-method
return self.fs.path

@property
def repo(self):
return self.fs.repo
Expand Down
10 changes: 7 additions & 3 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ def __init__(self, pattern_list, dirname, sep):
self.sep = sep
self.pattern_list = pattern_list
self.dirname = dirname
self.prefix = self.dirname + self.sep

self.regex_pattern_list = [
GitWildMatchPattern.pattern_to_regex(pattern_info.patterns)
Expand Down Expand Up @@ -74,10 +73,13 @@ def __call__(self, root: List[str], dirs: List[str], files: List[str]):
def _get_normalize_path(self, dirname, basename):
# NOTE: `relpath` is too slow, so we have to assume that both
# `dirname` and `self.dirname` are relative or absolute together.

prefix = self.dirname.rstrip(self.sep) + self.sep

if dirname == self.dirname:
path = basename
elif dirname.startswith(self.prefix):
rel = dirname[len(self.prefix) :]
elif dirname.startswith(prefix):
rel = dirname[len(prefix) :]
# NOTE: `os.path.join` is ~x5.5 slower
path = f"{rel}{self.sep}{basename}"
else:
Expand Down Expand Up @@ -210,6 +212,7 @@ def _update_trie(self, dirname: str, trie: Trie) -> None:
new_pattern = DvcIgnorePatterns.from_file(path, self.fs, name)
if old_pattern:
plist, prefix = merge_patterns(
self.fs.path.flavour,
old_pattern.pattern_list,
old_pattern.dirname,
new_pattern.pattern_list,
Expand Down Expand Up @@ -259,6 +262,7 @@ def _update_sub_repo(self, path, ignore_trie: Trie):
old_pattern = ignore_trie.longest_prefix(key).value
if old_pattern:
plist, prefix = merge_patterns(
self.fs.path.flavour,
old_pattern.pattern_list,
old_pattern.dirname,
new_pattern.pattern_list,
Expand Down
26 changes: 18 additions & 8 deletions dvc/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,12 +295,21 @@ def __init__(
if (
self.fs.scheme == "local"
and stage
and isinstance(stage.repo.fs, LocalFileSystem)
and path_isin(path, stage.repo.root_dir)
):
self.def_path = relpath(path, stage.wdir)
self.fs = stage.repo.fs
else:
self.def_path = path

if (
self.repo
and self.fs.scheme == "local"
and not self.fs.path.isabs(self.def_path)
):
self.fs = self.repo.fs

self._validate_output_path(path, stage)
# This output (and dependency) objects have too many paths/urls
# here is a list and comments:
Expand Down Expand Up @@ -342,7 +351,7 @@ def _parse_path(self, fs, fs_path):
if self.stage and not os.path.isabs(fs_path):
fs_path = fs.path.join(self.stage.wdir, fs_path)

abs_p = os.path.abspath(os.path.normpath(fs_path))
abs_p = fs.path.abspath(fs.path.normpath(fs_path))
return abs_p

def __repr__(self):
Expand All @@ -361,11 +370,11 @@ def __str__(self):
):
return str(self.def_path)

cur_dir = os.getcwd()
if path_isin(cur_dir, self.repo.root_dir):
return relpath(self.fs_path, cur_dir)
cur_dir = self.fs.path.getcwd()
if self.fs.path.isin(cur_dir, self.repo.root_dir):
return self.fs.path.relpath(self.fs_path, cur_dir)

return relpath(self.fs_path, self.repo.root_dir)
return self.fs.path.relpath(self.fs_path, self.repo.root_dir)

@property
def scheme(self):
Expand All @@ -379,11 +388,12 @@ def is_in_repo(self):
if urlparse(self.def_path).scheme == "remote":
return False

if os.path.isabs(self.def_path):
if self.fs.path.isabs(self.def_path):
return False

return self.repo and path_isin(
os.path.realpath(self.fs_path), self.repo.root_dir
return self.repo and self.fs.path.isin(
self.fs.path.realpath(self.fs_path),
self.repo.root_dir,
)

@property
Expand Down
10 changes: 2 additions & 8 deletions dvc/parsing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

from dvc.exceptions import DvcException
from dvc.parsing.interpolate import ParseError
from dvc.utils import relpath

from .context import (
Context,
Expand Down Expand Up @@ -135,19 +134,14 @@ def make_definition(

class DataResolver:
def __init__(self, repo: "Repo", wdir: str, d: dict):
from dvc.fs import LocalFileSystem

self.fs = fs = repo.fs

if os.path.isabs(wdir):
start = (
os.curdir if isinstance(fs, LocalFileSystem) else repo.root_dir
)
wdir = relpath(wdir, start)
wdir = fs.path.relpath(wdir)
wdir = "" if wdir == os.curdir else wdir

self.wdir = wdir
self.relpath = os.path.normpath(fs.path.join(self.wdir, "dvc.yaml"))
self.relpath = fs.path.normpath(fs.path.join(self.wdir, "dvc.yaml"))

vars_ = d.get(VARS_KWD, [])
check_interpolations(vars_, VARS_KWD, self.relpath)
Expand Down
2 changes: 1 addition & 1 deletion dvc/parsing/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ def merge_update(self, other: "Context", overwrite=False):

def merge_from(self, fs, item: str, wdir: str, overwrite=False):
path, _, keys_str = item.partition(":")
path = os.path.normpath(fs.path.join(wdir, path))
path = fs.path.normpath(fs.path.join(wdir, path))

select_keys = lfilter(bool, keys_str.split(",")) if keys_str else None
if path in self.imports:
Expand Down
Loading

0 comments on commit a80a85e

Please sign in to comment.