Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

get: copy/download files tracked by Git #2837

Merged
merged 23 commits into from
Dec 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions dvc/command/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def run(self):


def add_parser(subparsers, parent_parser):
GET_HELP = "Download data from DVC repository."
GET_HELP = "Download/copy files or directories from DVC repository."
get_parser = subparsers.add_parser(
"get",
parents=[parent_parser],
Expand All @@ -44,9 +44,14 @@ def add_parser(subparsers, parent_parser):
get_parser.add_argument(
"url", help="URL of Git repository with DVC project to download from."
)
get_parser.add_argument("path", help="Path to data within DVC repository.")
get_parser.add_argument(
"-o", "--out", nargs="?", help="Destination path to put data to."
"path", help="Path to a file or directory within a DVC repository."
jorgeorpinel marked this conversation as resolved.
Show resolved Hide resolved
)
get_parser.add_argument(
"-o",
"--out",
nargs="?",
help="Destination path to copy/download files to.",
)
get_parser.add_argument(
"--rev", nargs="?", help="DVC repository git revision."
Expand Down
6 changes: 6 additions & 0 deletions dvc/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,12 @@ def __init__(self, path, cause=None):
)


class PathOutsideRepoError(DvcException):
def __init__(self, path, repo):
msg = "The path '{}' does not exist in the target repository '{}'."
super(PathOutsideRepoError, self).__init__(msg.format(path, repo))


class DvcIgnoreInCollectedDirError(DvcException):
def __init__(self, ignore_dirname):
super(DvcIgnoreInCollectedDirError, self).__init__(
Expand Down
46 changes: 39 additions & 7 deletions dvc/repo/get.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,39 @@
import logging
import os
import shutil

import shortuuid

from dvc.exceptions import GetDVCFileError
from dvc.exceptions import NotDvcRepoError
from dvc.exceptions import OutputNotFoundError
from dvc.exceptions import UrlNotDvcRepoError
from dvc.exceptions import PathOutsideRepoError
from dvc.external_repo import external_repo
from dvc.path_info import PathInfo
from dvc.stage import Stage
from dvc.state import StateNoop
from dvc.utils import resolve_output
from dvc.utils.fs import remove
from dvc.utils.compat import FileNotFoundError

logger = logging.getLogger(__name__)


def _copy_git_file(repo, src, dst, repo_url):
src_full_path = os.path.join(repo.root_dir, src)
dst_full_path = os.path.abspath(dst)

if os.path.isdir(src_full_path):
shutil.copytree(src_full_path, dst_full_path)
return

try:
shutil.copy2(src_full_path, dst_full_path)
except FileNotFoundError:
efiop marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor

@efiop efiop Dec 6, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are not really raсing against anything, so how about we

if not os.path.exists(src_full_path):
    raise PathOutsideRepoError(src, repo_url)

before "if os.path.isdir()" instead of wrapping it in try&except, to make it more linear? Looks like shutil.copy2(src_full_path, dst_full_path) is the only line that could raise this exception, as isdir will return False on non-existing path. Seems like it would make it easier to grasp. I don't have a strong opinion here. What do you think?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Like so?

def _copy_git_file(repo, src, dst, repo_url):
    src_full_path = os.path.join(repo.root_dir, src)
    dst_full_path = os.path.abspath(dst)

    if os.path.isdir(src_full_path):
        shutil.copytree(src_full_path, dst_full_path)
        return

    try:
        shutil.copy2(src_full_path, dst_full_path)
    except FileNotFoundError:
        raise PathOutsideRepoError(src, repo_url)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@danihodovic Well, that would do it for me too πŸ˜„ Thanks! πŸ™‚

raise PathOutsideRepoError(src, repo_url)


@staticmethod
def get(url, path, out=None, rev=None):
out = resolve_output(path, out)
Expand Down Expand Up @@ -49,16 +66,31 @@ def get(url, path, out=None, rev=None):
# the same cache file might be used a few times in a directory.
repo.cache.local.cache_types = ["reflink", "hardlink", "copy"]

o = repo.find_out_by_relpath(path)
output = None
output_error = None

try:
output = repo.find_out_by_relpath(path)
except OutputNotFoundError as ex:
efiop marked this conversation as resolved.
Show resolved Hide resolved
output_error = ex

is_git_file = output_error and not os.path.isabs(path)
is_not_cached = output and not output.use_cache
Comment on lines +77 to +78
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it is not cached then this is also a git file, so this var names are confusing. Overall this logic is more complicated than necessary. It is simply either cached or a git managed file, so:

try:
    if out and out.use_cache:
        # do the pull and checkout
        ...
    else:
        # Non cached out outside repo, can't be handled
        if os.path.abspath(src):
            raise PathOutsideRepoError(...)

        # it's git-handled and already checked out to tmp dir
        # we should just copy, not a git specific operation
        ...
        copy_dir_or_file(src_full, dst_full)  
except FileNotFoundError:
    raise FileMissingError(...)

And again forgot about generic exception, basically:

$ dvc get http://some.repo non-existing-path
Can't find non-existing-path in some.repo neither as output 
nor as git handled file/dir

Message may be different, but the idea is that we don't know whether user tried to get an out or a git handled file.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Suor We've agreed to not use FileMissingError as its message is not applicable here. Hence PathOutsideRepoError, which is more suitable. The current logic corresponds to what we have in Repo.open.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed on the naming.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And indeed output_error is not wrapped as it is in repo.open.

Copy link
Contributor

@efiop efiop Dec 7, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking closer at Repo.open, it indeed has a more clear implementation than this, as it doesn't introduce is_git_file confusion.


if is_git_file or is_not_cached:
_copy_git_file(repo, path, out, url)
return

if output_error:
raise OutputNotFoundError(path)

with repo.state:
repo.cloud.pull(o.get_used_cache())
o.path_info = PathInfo(os.path.abspath(out))
with o.repo.state:
o.checkout()
repo.cloud.pull(output.get_used_cache())
output.path_info = PathInfo(os.path.abspath(out))
with output.repo.state:
output.checkout()

except NotDvcRepoError:
raise UrlNotDvcRepoError(url)
except OutputNotFoundError:
raise OutputNotFoundError(path)
finally:
remove(tmp_dir)
78 changes: 78 additions & 0 deletions tests/func/test_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@
from dvc.config import Config
from dvc.exceptions import GetDVCFileError
from dvc.exceptions import UrlNotDvcRepoError
from dvc.exceptions import OutputNotFoundError
from dvc.exceptions import PathOutsideRepoError
from dvc.repo import Repo
from dvc.system import System
from dvc.utils import makedirs
from dvc.utils.compat import fspath
from dvc.utils import fspath_py35
from tests.utils import trees_equal


Expand All @@ -38,6 +41,36 @@ def test_get_repo_dir(erepo):
trees_equal(src, dst)


def test_get_regular_file(erepo):
src = "some_file"
dst = "some_file_imported"

src_path = os.path.join(erepo.root_dir, src)
erepo.create(src_path, "hello")
erepo.dvc.scm.add([src_path])
erepo.dvc.scm.commit("add a regular file")
Repo.get(erepo.root_dir, src, dst)
danihodovic marked this conversation as resolved.
Show resolved Hide resolved

assert os.path.exists(dst)
assert os.path.isfile(dst)
assert filecmp.cmp(src_path, dst, shallow=False)


def test_get_regular_dir(erepo):
src = "some_directory"
dst = "some_directory_imported"

src_file_path = os.path.join(erepo.root_dir, src, "file.txt")
erepo.create(src_file_path, "hello")
erepo.dvc.scm.add([src_file_path])
erepo.dvc.scm.commit("add a regular dir")
Repo.get(erepo.root_dir, src, dst)

assert os.path.exists(dst)
assert os.path.isdir(dst)
trees_equal(os.path.join(erepo.root_dir, src), dst)


def test_cache_type_is_properly_overridden(erepo):
erepo.dvc.config.set(
Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, "symlink"
Expand Down Expand Up @@ -77,6 +110,51 @@ def test_get_a_dvc_file(erepo):
Repo.get(erepo.root_dir, "some_file.dvc")


# https://github.com/iterative/dvc/pull/2837#discussion_r352123053
def test_get_full_dvc_path(erepo):
external_data_dir = erepo.mkdtemp()
external_data = os.path.join(external_data_dir, "ext_data")
with open(external_data, "w+") as fobj:
fobj.write("ext_data")

cur_dir = os.getcwd()
os.chdir(erepo.root_dir)
erepo.dvc.add(external_data)
erepo.dvc.scm.add(["ext_data.dvc"])
erepo.dvc.scm.commit("add external data")
os.chdir(cur_dir)

Repo.get(erepo.root_dir, external_data, "ext_data_imported")
assert os.path.isfile("ext_data_imported")
assert filecmp.cmp(external_data, "ext_data_imported", shallow=False)
Suor marked this conversation as resolved.
Show resolved Hide resolved


def test_non_cached_output(tmp_path, erepo):
os.chdir(erepo.root_dir)
Suor marked this conversation as resolved.
Show resolved Hide resolved
erepo.dvc.run(
outs_no_cache=["non_cached_file"], cmd="echo hello > non_cached_file"
)
erepo.dvc.scm.add(["non_cached_file", "non_cached_file.dvc"])
erepo.dvc.scm.commit("add non-cached output")
os.chdir(fspath_py35(tmp_path))
Repo.get(erepo.root_dir, "non_cached_file")

src = os.path.join(erepo.root_dir, "non_cached_file")
assert os.path.isfile("non_cached_file")
assert filecmp.cmp(src, "non_cached_file", shallow=False)


# https://github.com/iterative/dvc/pull/2837#discussion_r352123053
def test_fails_with_files_outside_repo(erepo):
with pytest.raises(OutputNotFoundError):
Repo.get(erepo.root_dir, "/root/")


def test_fails_with_non_existing_files(erepo):
with pytest.raises(PathOutsideRepoError):
Repo.get(erepo.root_dir, "file_does_not_exist")


@pytest.mark.parametrize("dname", [".", "dir", "dir/subdir"])
def test_get_to_dir(dname, erepo):
src = erepo.FOO
Expand Down