Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

external: refactor external_repo() and its users #3190

Merged
merged 5 commits into from
Jan 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 15 additions & 16 deletions dvc/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from voluptuous import Schema, Required, Invalid

from dvc.repo import Repo
from dvc.exceptions import DvcException, NotDvcRepoError
from dvc.exceptions import DvcException
from dvc.external_repo import external_repo


Expand Down Expand Up @@ -60,14 +60,11 @@ def get_url(path, repo=None, rev=None, remote=None):
`repo`.
NOTE: There is no guarantee that the file actually exists in that location.
"""
try:
with _make_repo(repo, rev=rev) as _repo:
abspath = os.path.join(_repo.root_dir, path)
out, = _repo.find_outs_by_path(abspath)
remote_obj = _repo.cloud.get_remote(remote)
return str(remote_obj.checksum_to_path_info(out.checksum))
except NotDvcRepoError:
raise UrlNotDvcRepoError(repo)
with _make_repo(repo, rev=rev) as _repo:
_require_dvc(_repo)
Suor marked this conversation as resolved.
Show resolved Hide resolved
out = _repo.find_out_by_relpath(path)
remote_obj = _repo.cloud.get_remote(remote)
return str(remote_obj.checksum_to_path_info(out.checksum))


def open(path, repo=None, rev=None, remote=None, mode="r", encoding=None):
Expand Down Expand Up @@ -96,9 +93,8 @@ def __getattr__(self, name):

def _open(path, repo=None, rev=None, remote=None, mode="r", encoding=None):
with _make_repo(repo, rev=rev) as _repo:
abspath = os.path.join(_repo.root_dir, path)
with _repo.open(
abspath, remote=remote, mode=mode, encoding=encoding
with _repo.open_by_relpath(
path, remote=remote, mode=mode, encoding=encoding
) as fd:
yield fd

Expand All @@ -113,10 +109,7 @@ def read(path, repo=None, rev=None, remote=None, mode="r", encoding=None):

@contextmanager
def _make_repo(repo_url, rev=None):
if not repo_url or os.path.exists(repo_url):
assert (
rev is None
), "Git revisions are not supported for local DVC projects."
if rev is None and (not repo_url or os.path.exists(repo_url)):
yield Repo(repo_url)
else:
with external_repo(url=repo_url, rev=rev) as repo:
Comment on lines +112 to 115
Copy link
Contributor

@jorgeorpinel jorgeorpinel Mar 2, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is external_repo() used if there's a rev? What if I want to access a file in the current DVC repo but from a previous commit? I'm trying to test this but I'm getting AttributeError: module 'dvc' has no attribute 'api'

@Suor @skshetry @shcheklein

This comment was marked as resolved.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jorgeorpinel, it should just clone the current repo and checkout to given rev and access that file.

Copy link
Contributor

@jorgeorpinel jorgeorpinel Mar 2, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Testing this, I see it works anyway. But is it using external_repo() to clone the current repo in a temporary folder and then use that instead of just the current Git repo's tree?

Copy link
Contributor

@jorgeorpinel jorgeorpinel Mar 2, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

p.s. my test script:

import dvc.api

print(dvc.api.read('data', rev='HEAD^'))

data should be a DVC-tracked file with 2 different consecutive versions (in HEAD^ and in HEAD).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My point is I think Repo() also accepts rev, just wondering why it's not passed, and whether that would be a better implementation.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Repo does not accept rev. Repo is all about current workspace, so, you cannot just checkout the user's workspace and modify/read. So, even though it's still is a Repo underneath (if it's a DVC repo), it's cloned, checked out to a rev, put into a tmp cache directory and operated inside of it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. I was just confused by the name external_repo then. I guess that's the only way to get a previous rev from the current repo unfortunately.

Expand Down Expand Up @@ -149,6 +142,7 @@ def prepare_summon(name, repo=None, rev=None, summon_file="dvcsummon.yaml"):
named object specification and resolved paths to deps.
"""
with _make_repo(repo, rev=rev) as _repo:
_require_dvc(_repo)
try:
path = os.path.join(_repo.root_dir, summon_file)
obj = _get_object_spec(name, path)
Expand Down Expand Up @@ -242,3 +236,8 @@ def _import_string(import_name):
else:
return importlib.import_module(import_name)
return getattr(importlib.import_module(module), obj)


def _require_dvc(repo):
if not isinstance(repo, Repo):
raise UrlNotDvcRepoError(repo.url)
120 changes: 22 additions & 98 deletions dvc/dependency/repo.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,8 @@
import copy
import os
from contextlib import contextmanager

from funcy import merge

from .local import DependencyLOCAL
from dvc.external_repo import cached_clone
from dvc.external_repo import external_repo
from dvc.exceptions import NotDvcRepoError
from dvc.exceptions import OutputNotFoundError
from dvc.exceptions import NoOutputInExternalRepoError
from dvc.exceptions import PathMissingError
from dvc.config import NoRemoteError
from dvc.utils.fs import fs_copy
from dvc.path_info import PathInfo
from dvc.scm import SCM


class DependencyREPO(DependencyLOCAL):
Expand Down Expand Up @@ -44,34 +32,25 @@ def repo_pair(self):
def __str__(self):
return "{} ({})".format(self.def_path, self.def_repo[self.PARAM_URL])

@contextmanager
def _make_repo(self, **overrides):
with external_repo(**merge(self.def_repo, overrides)) as repo:
yield repo
def _make_repo(self, *, locked=True):
from dvc.external_repo import external_repo

def _get_checksum(self, updated=False):
rev_lock = None
if not updated:
rev_lock = self.def_repo.get(self.PARAM_REV_LOCK)
d = self.def_repo
rev = (d.get("rev_lock") if locked else None) or d.get("rev")
return external_repo(d["url"], rev=rev)

try:
with self._make_repo(rev_lock=rev_lock) as repo:
def _get_checksum(self, locked=True):
with self._make_repo(locked=locked) as repo:
try:
return repo.find_out_by_relpath(self.def_path).info["md5"]
except (NotDvcRepoError, NoOutputInExternalRepoError):
# Fall through and clone
pass

repo_path = cached_clone(
self.def_repo[self.PARAM_URL],
rev=rev_lock or self.def_repo.get(self.PARAM_REV),
)
path = PathInfo(os.path.join(repo_path, self.def_path))

return self.repo.cache.local.get_checksum(path)
except OutputNotFoundError:
path = PathInfo(os.path.join(repo.root_dir, self.def_path))
# We are polluting our repo cache with some dir listing here
return self.repo.cache.local.get_checksum(path)

def status(self):
current_checksum = self._get_checksum(updated=False)
updated_checksum = self._get_checksum(updated=True)
current_checksum = self._get_checksum(locked=True)
updated_checksum = self._get_checksum(locked=False)

if current_checksum != updated_checksum:
return {str(self): "update available"}
Expand All @@ -84,71 +63,16 @@ def save(self):
def dumpd(self):
return {self.PARAM_PATH: self.def_path, self.PARAM_REPO: self.def_repo}

def fetch(self):
with self._make_repo(
cache_dir=self.repo.cache.local.cache_dir
) as repo:
self.def_repo[self.PARAM_REV_LOCK] = repo.scm.get_rev()
def download(self, to):
with self._make_repo() as repo:
if self.def_repo.get(self.PARAM_REV_LOCK) is None:
self.def_repo[self.PARAM_REV_LOCK] = repo.scm.get_rev()

out = repo.find_out_by_relpath(self.def_path)
with repo.state:
try:
repo.cloud.pull(out.get_used_cache())
except NoRemoteError:
# It would not be good idea to raise exception if the
# file is already present in the cache
if not self.repo.cache.local.changed_cache(out.checksum):
return out
raise

return out

@staticmethod
def _is_git_file(repo_dir, path):
from dvc.repo import Repo

if os.path.isabs(path):
return False

try:
repo = Repo(repo_dir)
except NotDvcRepoError:
return True

try:
output = repo.find_out_by_relpath(path)
return not output.use_cache
except OutputNotFoundError:
return True
finally:
repo.close()

def _copy_if_git_file(self, to_path):
src_path = self.def_path
repo_dir = cached_clone(**self.def_repo)

if not self._is_git_file(repo_dir, src_path):
return False

src_full_path = os.path.join(repo_dir, src_path)
dst_full_path = os.path.abspath(to_path)
fs_copy(src_full_path, dst_full_path)
self.def_repo[self.PARAM_REV_LOCK] = SCM(repo_dir).get_rev()
return True
if hasattr(repo, "cache"):
repo.cache.local.cache_dir = self.repo.cache.local.cache_dir

def download(self, to):
try:
if self._copy_if_git_file(to.fspath):
return

out = self.fetch()
to.info = copy.copy(out.info)
to.checkout()
except (FileNotFoundError):
raise PathMissingError(
self.def_path, self.def_repo[self.PARAM_URL]
)
repo.pull_to(self.def_path, to.path_info)

def update(self):
with self._make_repo(rev_lock=None) as repo:
with self._make_repo(locked=False) as repo:
self.def_repo[self.PARAM_REV_LOCK] = repo.scm.get_rev()
1 change: 1 addition & 0 deletions dvc/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ def __init__(self, etag, cached_etag):

class FileMissingError(DvcException):
def __init__(self, path):
self.path = path
super().__init__(
"Can't find '{}' neither locally nor on remote".format(path)
)
Expand Down
Loading