Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

get: handle non-DVC repositories #3097

Merged
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions dvc/command/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ def run(self):


def add_parser(subparsers, parent_parser):
GET_HELP = "Download/copy files or directories from DVC repository."
GET_HELP = (
"Download a file or directory from any DVC project or Git repository."
)
get_parser = subparsers.add_parser(
"get",
parents=[parent_parser],
Expand All @@ -40,18 +42,17 @@ def add_parser(subparsers, parent_parser):
formatter_class=argparse.RawDescriptionHelpFormatter,
)
get_parser.add_argument(
"url", help="URL of Git repository with DVC project to download from."
"url",
help="Location of DVC project or Git repository to download from",
jorgeorpinel marked this conversation as resolved.
Show resolved Hide resolved
)
get_parser.add_argument(
"path", help="Path to a file or directory within a DVC repository."
"path",
help="Path to a file or directory within the project or repository",
)
get_parser.add_argument(
"-o",
"--out",
nargs="?",
help="Destination path to copy/download files to.",
"-o", "--out", nargs="?", help="Destination path to download files to"
)
get_parser.add_argument(
"--rev", nargs="?", help="DVC repository git revision."
"--rev", nargs="?", help="Git revision (e.g. branch, tag, SHA)"
)
get_parser.set_defaults(func=CmdGet)
13 changes: 8 additions & 5 deletions dvc/command/imp.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ def run(self):

def add_parser(subparsers, parent_parser):
IMPORT_HELP = (
"Download data from DVC repository and take it under DVC control."
"Download a file or directory from any DVC project or Git repository"
"and take it under DVC control."
)

import_parser = subparsers.add_parser(
Expand All @@ -41,15 +42,17 @@ def add_parser(subparsers, parent_parser):
formatter_class=argparse.RawTextHelpFormatter,
)
import_parser.add_argument(
"url", help="URL of Git repository with DVC project to download from."
"url",
help="Location of DVC project or Git repository to download from",
)
import_parser.add_argument(
"path", help="Path to data within DVC repository."
"path",
help="Path to a file or directory within the project or repository",
)
import_parser.add_argument(
"-o", "--out", nargs="?", help="Destination path to put data to."
"-o", "--out", nargs="?", help="Destination path to download files to"
)
import_parser.add_argument(
"--rev", nargs="?", help="DVC repository git revision."
"--rev", nargs="?", help="Git revision (e.g. branch, tag, SHA)"
)
import_parser.set_defaults(func=CmdImport)
5 changes: 0 additions & 5 deletions dvc/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,11 +240,6 @@ def __init__(self, ignore_dirname):
)


class UrlNotDvcRepoError(DvcException):
def __init__(self, url):
super().__init__("URL '{}' is not a dvc repository.".format(url))


class GitHookAlreadyExistsError(DvcException):
def __init__(self, hook_name):
super().__init__(
Expand Down
57 changes: 33 additions & 24 deletions dvc/repo/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@
DvcException,
NotDvcRepoError,
OutputNotFoundError,
UrlNotDvcRepoError,
PathMissingError,
)
from dvc.external_repo import external_repo
from dvc.external_repo import external_repo, cached_clone
from dvc.path_info import PathInfo
from dvc.stage import Stage
from dvc.utils import resolve_output
Expand Down Expand Up @@ -42,39 +41,49 @@ def get(url, path, out=None, rev=None):
# and won't work with reflink/hardlink.
dpath = os.path.dirname(os.path.abspath(out))
tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid()))
raw_git_dir = None
try:
with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
# Try any links possible to avoid data duplication.
#
# Not using symlink, because we need to remove cache after we are
# done, and to make that work we would have to copy data over
# anyway before removing the cache, so we might just copy it
# right away.
#
# Also, we can't use theoretical "move" link type here, because
# the same cache file might be used a few times in a directory.
repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

try:
output = repo.find_out_by_relpath(path)
except OutputNotFoundError:
output = None

if output and output.use_cache:
_get_cached(repo, output, out)
else:
try:
with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo:
# Try any links possible to avoid data duplication.
#
# Not using symlink, because we need to remove cache after we
# are done, and to make that work we would have to copy data
# over anyway before removing the cache, so we might just copy
# it right away.
#
# Also, we can't use theoretical "move" link type here, because
# the same cache file might be used a few times in a directory.
repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

try:
output = repo.find_out_by_relpath(path)
except OutputNotFoundError:
output = None

if output and output.use_cache:
_get_cached(repo, output, out)
return

# Either an uncached out with absolute path or a user error
if os.path.isabs(path):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Absolute paths are not allowed even for pure-git repos, hence why I've suggested that example that has the same code handling both non-cached and git files. No need to worry much about second cached_clone being redundand, as we cache those anyways, so the overhead is not big.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it. I tried to fold it like you demonstrated before, but it looks like the files are missing if I try to do the copy outside of the context manager.

So I extracted the part that checks if the path is absolute into a function and called it twice.

raise FileNotFoundError

fs_copy(os.path.join(repo.root_dir, path), out)
return

except NotDvcRepoError:
# Not a DVC repository, continue below and copy from git
pass

raw_git_dir = cached_clone(url, rev=rev)
fs_copy(os.path.join(raw_git_dir, path), out)
except (OutputNotFoundError, FileNotFoundError):
raise PathMissingError(path, url)
except NotDvcRepoError:
raise UrlNotDvcRepoError(url)
fabiosantoscode marked this conversation as resolved.
Show resolved Hide resolved
finally:
remove(tmp_dir)
if raw_git_dir:
remove(raw_git_dir)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think there is any need in removing it, it is a cached copy after all 🙂 And if you check external_repo, it doesn't delete those either. Or am I missing something?

Copy link
Contributor Author

@fabiosantoscode fabiosantoscode Jan 10, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's good news. I figured there was some automatic cleanup happening but haven't seen how it works yet.



def _get_cached(repo, output, out):
Expand Down
10 changes: 3 additions & 7 deletions tests/func/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from dvc.cache import Cache
from dvc.config import Config
from dvc.exceptions import UrlNotDvcRepoError
from dvc.repo.get import GetDVCFileError, PathMissingError
from dvc.repo import Repo
from dvc.system import System
Expand Down Expand Up @@ -87,9 +86,10 @@ def test_get_repo_rev(tmp_dir, erepo_dir):
def test_get_from_non_dvc_repo(tmp_dir, erepo_dir):
erepo_dir.scm.repo.index.remove([erepo_dir.dvc.dvc_dir], r=True)
erepo_dir.scm.commit("remove dvc")
erepo_dir.scm_gen({"some_file": "contents"}, commit="create file")

with pytest.raises(UrlNotDvcRepoError):
Repo.get(fspath(erepo_dir), "some_file.zip")
Repo.get(fspath(erepo_dir), "some_file", "file_imported")
assert (tmp_dir / "file_imported").read_text() == "contents"


def test_get_a_dvc_file(tmp_dir, erepo_dir):
Expand Down Expand Up @@ -164,10 +164,6 @@ def test_get_from_non_dvc_master(tmp_dir, erepo_dir, caplog):
erepo_dir.dvc.scm.repo.index.remove([".dvc"], r=True)
erepo_dir.dvc.scm.commit("remove .dvc")

# sanity check
with pytest.raises(UrlNotDvcRepoError):
Repo.get(fspath(erepo_dir), "some_file")

caplog.clear()
dst = "file_imported"
with caplog.at_level(logging.INFO, logger="dvc"):
Expand Down