From 0dd5647d3c3f8fafdfa5bed524d9f0ef3e1842b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Thu, 9 Jan 2020 18:15:25 +0000 Subject: [PATCH 01/20] get: handle non-DVC repositories Allows us to `dvc get` from non-DVC source repositories. Fixes #3089 --- dvc/exceptions.py | 5 ----- dvc/external_repo.py | 4 ++-- dvc/repo/get.py | 39 ++++++++++++++++++++++----------------- tests/func/test_get.py | 10 +++------- 4 files changed, 27 insertions(+), 31 deletions(-) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index e48014e95a..a551e35db1 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -240,11 +240,6 @@ def __init__(self, ignore_dirname): ) -class UrlNotDvcRepoError(DvcException): - def __init__(self, url): - super().__init__("URL '{}' is not a dvc repository.".format(url)) - - class GitHookAlreadyExistsError(DvcException): def __init__(self, hook_name): super().__init__( diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 9ff2f2a413..8f7cfaef55 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -33,7 +33,7 @@ def external_repo(url=None, rev=None, rev_lock=None, cache_dir=None): repo.close() -def cached_clone(url, rev=None, **_ignored_kwargs): +def cached_clone(url, rev=None, clone_path=None, **_ignored_kwargs): """Clone an external git repo to a temporary directory. Returns the path to a local temporary directory with the specified @@ -44,7 +44,7 @@ def cached_clone(url, rev=None, **_ignored_kwargs): """ - new_path = tempfile.mkdtemp("dvc-erepo") + new_path = clone_path or tempfile.mkdtemp("dvc-erepo") # Copy and adjust existing clean clone if (url, None, None) in REPO_CACHE: diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 17607bbfae..58eddfcbe9 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -7,10 +7,9 @@ DvcException, NotDvcRepoError, OutputNotFoundError, - UrlNotDvcRepoError, PathMissingError, ) -from dvc.external_repo import external_repo +from dvc.external_repo import cached_clone from dvc.path_info import PathInfo from dvc.stage import Stage from dvc.utils import resolve_output @@ -28,8 +27,15 @@ def __init__(self): ) +# Dummy exception raised to signal a plain file copy is needed +class _DoPlainCopy(DvcException): + pass + + @staticmethod def get(url, path, out=None, rev=None): + from dvc.repo import Repo + out = resolve_output(path, out) if Stage.is_valid_filename(out): @@ -43,7 +49,8 @@ def get(url, path, out=None, rev=None): dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: - with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo: + cached_clone(url, rev=rev, clone_path=tmp_dir) + try: # Try any links possible to avoid data duplication. # # Not using symlink, because we need to remove cache after we are @@ -53,26 +60,24 @@ def get(url, path, out=None, rev=None): # # Also, we can't use theoretical "move" link type here, because # the same cache file might be used a few times in a directory. + repo = Repo(tmp_dir) repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] + output = repo.find_out_by_relpath(path) + if not output.use_cache: + # Catch this below and go for a plain old fs_copy + raise _DoPlainCopy + _get_cached(repo, output, out) - try: - output = repo.find_out_by_relpath(path) - except OutputNotFoundError: - output = None - - if output and output.use_cache: - _get_cached(repo, output, out) - else: - # Either an uncached out with absolute path or a user error - if os.path.isabs(path): - raise FileNotFoundError + except (NotDvcRepoError, OutputNotFoundError, _DoPlainCopy): + # It's an uncached out with absolute path, a non-DVC repo, or a + # user error + if os.path.isabs(path): + raise FileNotFoundError - fs_copy(os.path.join(repo.root_dir, path), out) + fs_copy(os.path.join(tmp_dir, path), out) except (OutputNotFoundError, FileNotFoundError): raise PathMissingError(path, url) - except NotDvcRepoError: - raise UrlNotDvcRepoError(url) finally: remove(tmp_dir) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index cb2d050767..107d3dcc8c 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -5,7 +5,6 @@ from dvc.cache import Cache from dvc.config import Config -from dvc.exceptions import UrlNotDvcRepoError from dvc.repo.get import GetDVCFileError, PathMissingError from dvc.repo import Repo from dvc.system import System @@ -87,9 +86,10 @@ def test_get_repo_rev(tmp_dir, erepo_dir): def test_get_from_non_dvc_repo(tmp_dir, erepo_dir): erepo_dir.scm.repo.index.remove([erepo_dir.dvc.dvc_dir], r=True) erepo_dir.scm.commit("remove dvc") + erepo_dir.scm_gen({"some_file": "contents"}, commit="create file") - with pytest.raises(UrlNotDvcRepoError): - Repo.get(fspath(erepo_dir), "some_file.zip") + Repo.get(fspath(erepo_dir), "some_file", "file_imported") + assert (tmp_dir / "file_imported").read_text() == "contents" def test_get_a_dvc_file(tmp_dir, erepo_dir): @@ -164,10 +164,6 @@ def test_get_from_non_dvc_master(tmp_dir, erepo_dir, caplog): erepo_dir.dvc.scm.repo.index.remove([".dvc"], r=True) erepo_dir.dvc.scm.commit("remove .dvc") - # sanity check - with pytest.raises(UrlNotDvcRepoError): - Repo.get(fspath(erepo_dir), "some_file") - caplog.clear() dst = "file_imported" with caplog.at_level(logging.INFO, logger="dvc"): From 07637a6f8f9705fc21b8dbb9f0b03df68aac1c32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 14:00:26 +0000 Subject: [PATCH 02/20] style: improve code flow and move comments --- dvc/repo/get.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 58eddfcbe9..785c388a4a 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -51,6 +51,8 @@ def get(url, path, out=None, rev=None): try: cached_clone(url, rev=rev, clone_path=tmp_dir) try: + repo = Repo(tmp_dir) + # Try any links possible to avoid data duplication. # # Not using symlink, because we need to remove cache after we are @@ -60,21 +62,23 @@ def get(url, path, out=None, rev=None): # # Also, we can't use theoretical "move" link type here, because # the same cache file might be used a few times in a directory. - repo = Repo(tmp_dir) repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] + output = repo.find_out_by_relpath(path) - if not output.use_cache: + if output.use_cache: # Catch this below and go for a plain old fs_copy - raise _DoPlainCopy - _get_cached(repo, output, out) + _get_cached(repo, output, out) + return + + except (NotDvcRepoError, OutputNotFoundError): + pass - except (NotDvcRepoError, OutputNotFoundError, _DoPlainCopy): - # It's an uncached out with absolute path, a non-DVC repo, or a - # user error - if os.path.isabs(path): - raise FileNotFoundError + # It's an uncached out with absolute path, a non-DVC repo, or a + # user error + if os.path.isabs(path): + raise FileNotFoundError - fs_copy(os.path.join(tmp_dir, path), out) + fs_copy(os.path.join(tmp_dir, path), out) except (OutputNotFoundError, FileNotFoundError): raise PathMissingError(path, url) From f126f2483b8f775a89ac17e259398298850c8b59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 14:10:35 +0000 Subject: [PATCH 03/20] doc: change get documentation so that it doesn't imply the target must be DVC enabled --- dvc/command/get.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 9aba35c0d8..492d7efba4 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -31,7 +31,7 @@ def run(self): def add_parser(subparsers, parent_parser): - GET_HELP = "Download/copy files or directories from DVC repository." + GET_HELP = "Download/copy files or directories from git repository." get_parser = subparsers.add_parser( "get", parents=[parent_parser], @@ -40,10 +40,10 @@ def add_parser(subparsers, parent_parser): formatter_class=argparse.RawDescriptionHelpFormatter, ) get_parser.add_argument( - "url", help="URL of Git repository with DVC project to download from." + "url", help="URL of Git repository to download from." ) get_parser.add_argument( - "path", help="Path to a file or directory within a DVC repository." + "path", help="Path to a file or directory within the repository." ) get_parser.add_argument( "-o", @@ -52,6 +52,6 @@ def add_parser(subparsers, parent_parser): help="Destination path to copy/download files to.", ) get_parser.add_argument( - "--rev", nargs="?", help="DVC repository git revision." + "--rev", nargs="?", help="Repository git revision." ) get_parser.set_defaults(func=CmdGet) From 86b9a1cfe4718ff3b3fdc44438d1e02a9f45a52f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 16:48:09 +0000 Subject: [PATCH 04/20] get: recoup cache optimal location --- dvc/external_repo.py | 4 ++-- dvc/repo/get.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 8f7cfaef55..9ff2f2a413 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -33,7 +33,7 @@ def external_repo(url=None, rev=None, rev_lock=None, cache_dir=None): repo.close() -def cached_clone(url, rev=None, clone_path=None, **_ignored_kwargs): +def cached_clone(url, rev=None, **_ignored_kwargs): """Clone an external git repo to a temporary directory. Returns the path to a local temporary directory with the specified @@ -44,7 +44,7 @@ def cached_clone(url, rev=None, clone_path=None, **_ignored_kwargs): """ - new_path = clone_path or tempfile.mkdtemp("dvc-erepo") + new_path = tempfile.mkdtemp("dvc-erepo") # Copy and adjust existing clean clone if (url, None, None) in REPO_CACHE: diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 785c388a4a..49bb4e9013 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -3,6 +3,8 @@ import shortuuid +from dvc.cache import CacheConfig +from dvc.config import Config from dvc.exceptions import ( DvcException, NotDvcRepoError, @@ -49,9 +51,11 @@ def get(url, path, out=None, rev=None): dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: - cached_clone(url, rev=rev, clone_path=tmp_dir) + clone_dir = cached_clone(url, rev=rev) try: - repo = Repo(tmp_dir) + repo = Repo(clone_dir) + cache_config = CacheConfig(repo.config) + cache_config.set_dir(tmp_dir, level=Config.LEVEL_LOCAL) # Try any links possible to avoid data duplication. # @@ -78,12 +82,13 @@ def get(url, path, out=None, rev=None): if os.path.isabs(path): raise FileNotFoundError - fs_copy(os.path.join(tmp_dir, path), out) + fs_copy(os.path.join(clone_dir, path), out) except (OutputNotFoundError, FileNotFoundError): raise PathMissingError(path, url) finally: remove(tmp_dir) + remove(clone_dir) def _get_cached(repo, output, out): From a01f81ed096018c8e729f0884223357233d76e56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 16:58:53 +0000 Subject: [PATCH 05/20] Update dvc/repo/get.py Co-Authored-By: Ruslan Kuprieiev --- dvc/repo/get.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 49bb4e9013..4c1f554237 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -70,7 +70,6 @@ def get(url, path, out=None, rev=None): output = repo.find_out_by_relpath(path) if output.use_cache: - # Catch this below and go for a plain old fs_copy _get_cached(repo, output, out) return From 6cb7308cdb4e5004d0429d54b3355bc0d482836c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 16:59:40 +0000 Subject: [PATCH 06/20] Revert "get: recoup cache optimal location" This reverts commit 8e4922800f6ce47210976764af1c05e31e90deac. --- dvc/external_repo.py | 4 ++-- dvc/repo/get.py | 11 +++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 9ff2f2a413..8f7cfaef55 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -33,7 +33,7 @@ def external_repo(url=None, rev=None, rev_lock=None, cache_dir=None): repo.close() -def cached_clone(url, rev=None, **_ignored_kwargs): +def cached_clone(url, rev=None, clone_path=None, **_ignored_kwargs): """Clone an external git repo to a temporary directory. Returns the path to a local temporary directory with the specified @@ -44,7 +44,7 @@ def cached_clone(url, rev=None, **_ignored_kwargs): """ - new_path = tempfile.mkdtemp("dvc-erepo") + new_path = clone_path or tempfile.mkdtemp("dvc-erepo") # Copy and adjust existing clean clone if (url, None, None) in REPO_CACHE: diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 4c1f554237..19028861c8 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -3,8 +3,6 @@ import shortuuid -from dvc.cache import CacheConfig -from dvc.config import Config from dvc.exceptions import ( DvcException, NotDvcRepoError, @@ -51,11 +49,9 @@ def get(url, path, out=None, rev=None): dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: - clone_dir = cached_clone(url, rev=rev) + cached_clone(url, rev=rev, clone_path=tmp_dir) try: - repo = Repo(clone_dir) - cache_config = CacheConfig(repo.config) - cache_config.set_dir(tmp_dir, level=Config.LEVEL_LOCAL) + repo = Repo(tmp_dir) # Try any links possible to avoid data duplication. # @@ -81,13 +77,12 @@ def get(url, path, out=None, rev=None): if os.path.isabs(path): raise FileNotFoundError - fs_copy(os.path.join(clone_dir, path), out) + fs_copy(os.path.join(tmp_dir, path), out) except (OutputNotFoundError, FileNotFoundError): raise PathMissingError(path, url) finally: remove(tmp_dir) - remove(clone_dir) def _get_cached(repo, output, out): From 7873abcadeb3e992b1f68470fa021acf84eb713b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 17:00:04 +0000 Subject: [PATCH 07/20] remove unused exception class --- dvc/repo/get.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 19028861c8..0585cbfc97 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -27,11 +27,6 @@ def __init__(self): ) -# Dummy exception raised to signal a plain file copy is needed -class _DoPlainCopy(DvcException): - pass - - @staticmethod def get(url, path, out=None, rev=None): from dvc.repo import Repo From 6a4f4031b0aac1c8859424703088df903746a6c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 17:29:21 +0000 Subject: [PATCH 08/20] change command doc string --- dvc/command/get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 492d7efba4..1b1e1bffab 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -31,7 +31,7 @@ def run(self): def add_parser(subparsers, parent_parser): - GET_HELP = "Download/copy files or directories from git repository." + GET_HELP = "Download/copy files or directories from Git repository." get_parser = subparsers.add_parser( "get", parents=[parent_parser], From 51f2fa184fcc73a62a6c8a08a3896eb3ebcb6b6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 18:07:53 +0000 Subject: [PATCH 09/20] Update dvc/command/get.py Co-Authored-By: Jorge Orpinel --- dvc/command/get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 1b1e1bffab..90cac549c7 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -52,6 +52,6 @@ def add_parser(subparsers, parent_parser): help="Destination path to copy/download files to.", ) get_parser.add_argument( - "--rev", nargs="?", help="Repository git revision." + "--rev", nargs="?", help="Git revision (e.g. branch, tag, SHA)" ) get_parser.set_defaults(func=CmdGet) From 04619d559b85b06366936c11b6f748952be0f280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 18:08:07 +0000 Subject: [PATCH 10/20] Update dvc/command/get.py Co-Authored-By: Jorge Orpinel --- dvc/command/get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 90cac549c7..3774bd9cb9 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -43,7 +43,7 @@ def add_parser(subparsers, parent_parser): "url", help="URL of Git repository to download from." ) get_parser.add_argument( - "path", help="Path to a file or directory within the repository." + "path", help="Path to a file or directory within the project or repository" ) get_parser.add_argument( "-o", From 9a98622020d1e1c142b35b4afa4fd6b1011fbc36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 18:08:33 +0000 Subject: [PATCH 11/20] Update dvc/command/get.py Co-Authored-By: Jorge Orpinel --- dvc/command/get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 3774bd9cb9..daae7fa266 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -40,7 +40,7 @@ def add_parser(subparsers, parent_parser): formatter_class=argparse.RawDescriptionHelpFormatter, ) get_parser.add_argument( - "url", help="URL of Git repository to download from." + "url", help="Location of DVC project or Git repository to download from" ) get_parser.add_argument( "path", help="Path to a file or directory within the project or repository" From 5b36ffd60e62aa389ce2c386d26b7f0c696b9441 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 18:08:46 +0000 Subject: [PATCH 12/20] Update dvc/command/get.py Co-Authored-By: Jorge Orpinel --- dvc/command/get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index daae7fa266..7d5af18c79 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -31,7 +31,7 @@ def run(self): def add_parser(subparsers, parent_parser): - GET_HELP = "Download/copy files or directories from Git repository." + GET_HELP = "Download a file or directory from any DVC project or Git repository" get_parser = subparsers.add_parser( "get", parents=[parent_parser], From c16f8825dc47fe7e1d8594c3a47eae497407c604 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 18:27:04 +0000 Subject: [PATCH 13/20] import: update documentation Fixes #3077 --- dvc/command/get.py | 10 +++++++--- dvc/command/imp.py | 12 ++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 7d5af18c79..38ad490003 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -31,7 +31,9 @@ def run(self): def add_parser(subparsers, parent_parser): - GET_HELP = "Download a file or directory from any DVC project or Git repository" + GET_HELP = ( + "Download a file or directory from any DVC project or Git repository" + ) get_parser = subparsers.add_parser( "get", parents=[parent_parser], @@ -40,10 +42,12 @@ def add_parser(subparsers, parent_parser): formatter_class=argparse.RawDescriptionHelpFormatter, ) get_parser.add_argument( - "url", help="Location of DVC project or Git repository to download from" + "url", + help="Location of DVC project or Git repository to download from.", ) get_parser.add_argument( - "path", help="Path to a file or directory within the project or repository" + "path", + help="Path to a file or directory within the project or repository", ) get_parser.add_argument( "-o", diff --git a/dvc/command/imp.py b/dvc/command/imp.py index 9315f03744..b3fe61a1b8 100644 --- a/dvc/command/imp.py +++ b/dvc/command/imp.py @@ -30,7 +30,8 @@ def run(self): def add_parser(subparsers, parent_parser): IMPORT_HELP = ( - "Download data from DVC repository and take it under DVC control." + "Download data from DVC project or Git repository and take it under " + "DVC control." ) import_parser = subparsers.add_parser( @@ -41,13 +42,12 @@ def add_parser(subparsers, parent_parser): formatter_class=argparse.RawTextHelpFormatter, ) import_parser.add_argument( - "url", help="URL of Git repository with DVC project to download from." + "url", + help="Location of Git repository with DVC project to download from.", ) + import_parser.add_argument("path", help="Path to data within DVC project.") import_parser.add_argument( - "path", help="Path to data within DVC repository." - ) - import_parser.add_argument( - "-o", "--out", nargs="?", help="Destination path to put data to." + "-o", "--out", nargs="?", help="Destination path to put data in." ) import_parser.add_argument( "--rev", nargs="?", help="DVC repository git revision." From f5e40b2a4746ef68ce59288897a58dbc7cbfa5c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 22:22:28 +0000 Subject: [PATCH 14/20] get: leverage external_repo() context manager for DVC repositories --- dvc/command/get.py | 6 ++--- dvc/command/imp.py | 10 +++---- dvc/external_repo.py | 4 +-- dvc/repo/get.py | 63 ++++++++++++++++++++++++-------------------- 4 files changed, 45 insertions(+), 38 deletions(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 38ad490003..97d9317d71 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -32,7 +32,7 @@ def run(self): def add_parser(subparsers, parent_parser): GET_HELP = ( - "Download a file or directory from any DVC project or Git repository" + "Download a file or directory from any DVC project or Git repository." ) get_parser = subparsers.add_parser( "get", @@ -43,7 +43,7 @@ def add_parser(subparsers, parent_parser): ) get_parser.add_argument( "url", - help="Location of DVC project or Git repository to download from.", + help="Location of DVC project or Git repository to download from", ) get_parser.add_argument( "path", @@ -53,7 +53,7 @@ def add_parser(subparsers, parent_parser): "-o", "--out", nargs="?", - help="Destination path to copy/download files to.", + help="Destination path to download files to", ) get_parser.add_argument( "--rev", nargs="?", help="Git revision (e.g. branch, tag, SHA)" diff --git a/dvc/command/imp.py b/dvc/command/imp.py index b3fe61a1b8..8797d3c05c 100644 --- a/dvc/command/imp.py +++ b/dvc/command/imp.py @@ -30,7 +30,7 @@ def run(self): def add_parser(subparsers, parent_parser): IMPORT_HELP = ( - "Download data from DVC project or Git repository and take it under " + "Download a file or directory from any DVC project or Git repository and take it under " "DVC control." ) @@ -43,13 +43,13 @@ def add_parser(subparsers, parent_parser): ) import_parser.add_argument( "url", - help="Location of Git repository with DVC project to download from.", + help="Location of DVC project or Git repository to download from", ) - import_parser.add_argument("path", help="Path to data within DVC project.") + import_parser.add_argument("path", help="Path to a file or directory within the project or repository") import_parser.add_argument( - "-o", "--out", nargs="?", help="Destination path to put data in." + "-o", "--out", nargs="?", help="Destination path to download files to" ) import_parser.add_argument( - "--rev", nargs="?", help="DVC repository git revision." + "--rev", nargs="?", help="Git revision (e.g. branch, tag, SHA)" ) import_parser.set_defaults(func=CmdImport) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 8f7cfaef55..9ff2f2a413 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -33,7 +33,7 @@ def external_repo(url=None, rev=None, rev_lock=None, cache_dir=None): repo.close() -def cached_clone(url, rev=None, clone_path=None, **_ignored_kwargs): +def cached_clone(url, rev=None, **_ignored_kwargs): """Clone an external git repo to a temporary directory. Returns the path to a local temporary directory with the specified @@ -44,7 +44,7 @@ def cached_clone(url, rev=None, clone_path=None, **_ignored_kwargs): """ - new_path = clone_path or tempfile.mkdtemp("dvc-erepo") + new_path = tempfile.mkdtemp("dvc-erepo") # Copy and adjust existing clean clone if (url, None, None) in REPO_CACHE: diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 0585cbfc97..745e24fe1c 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -9,7 +9,7 @@ OutputNotFoundError, PathMissingError, ) -from dvc.external_repo import cached_clone +from dvc.external_repo import external_repo, cached_clone from dvc.path_info import PathInfo from dvc.stage import Stage from dvc.utils import resolve_output @@ -29,8 +29,6 @@ def __init__(self): @staticmethod def get(url, path, out=None, rev=None): - from dvc.repo import Repo - out = resolve_output(path, out) if Stage.is_valid_filename(out): @@ -43,41 +41,50 @@ def get(url, path, out=None, rev=None): # and won't work with reflink/hardlink. dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) + raw_git_dir = None try: - cached_clone(url, rev=rev, clone_path=tmp_dir) try: - repo = Repo(tmp_dir) - - # Try any links possible to avoid data duplication. - # - # Not using symlink, because we need to remove cache after we are - # done, and to make that work we would have to copy data over - # anyway before removing the cache, so we might just copy it - # right away. - # - # Also, we can't use theoretical "move" link type here, because - # the same cache file might be used a few times in a directory. - repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] - - output = repo.find_out_by_relpath(path) - if output.use_cache: - _get_cached(repo, output, out) + with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo: + # Try any links possible to avoid data duplication. + # + # Not using symlink, because we need to remove cache after we + # are done, and to make that work we would have to copy data + # over anyway before removing the cache, so we might just copy + # it right away. + # + # Also, we can't use theoretical "move" link type here, because + # the same cache file might be used a few times in a directory. + repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] + + try: + output = repo.find_out_by_relpath(path) + except OutputNotFoundError: + output = None + + if output and output.use_cache: + _get_cached(repo, output, out) + return + + # Either an uncached out with absolute path or a user error + + if os.path.isabs(path): + raise FileNotFoundError + + fs_copy(os.path.join(repo.root_dir, path), out) return - except (NotDvcRepoError, OutputNotFoundError): + except NotDvcRepoError: + # Not a DVC repository, continue below and copy from git pass - # It's an uncached out with absolute path, a non-DVC repo, or a - # user error - if os.path.isabs(path): - raise FileNotFoundError - - fs_copy(os.path.join(tmp_dir, path), out) - + raw_git_dir = cached_clone(url, rev=rev) + fs_copy(os.path.join(raw_git_dir, path), out) except (OutputNotFoundError, FileNotFoundError): raise PathMissingError(path, url) finally: remove(tmp_dir) + if raw_git_dir: + remove(raw_git_dir) def _get_cached(repo, output, out): From 5fb6640867e63dcb8b69bc6f2c7fbf0be73e99bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 22:29:27 +0000 Subject: [PATCH 15/20] run black --- dvc/command/get.py | 5 +---- dvc/command/imp.py | 9 ++++++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 97d9317d71..22856f1c07 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -50,10 +50,7 @@ def add_parser(subparsers, parent_parser): help="Path to a file or directory within the project or repository", ) get_parser.add_argument( - "-o", - "--out", - nargs="?", - help="Destination path to download files to", + "-o", "--out", nargs="?", help="Destination path to download files to" ) get_parser.add_argument( "--rev", nargs="?", help="Git revision (e.g. branch, tag, SHA)" diff --git a/dvc/command/imp.py b/dvc/command/imp.py index 8797d3c05c..2a8d515ffd 100644 --- a/dvc/command/imp.py +++ b/dvc/command/imp.py @@ -30,8 +30,8 @@ def run(self): def add_parser(subparsers, parent_parser): IMPORT_HELP = ( - "Download a file or directory from any DVC project or Git repository and take it under " - "DVC control." + "Download a file or directory from any DVC project or Git repository" + "and take it under DVC control." ) import_parser = subparsers.add_parser( @@ -45,7 +45,10 @@ def add_parser(subparsers, parent_parser): "url", help="Location of DVC project or Git repository to download from", ) - import_parser.add_argument("path", help="Path to a file or directory within the project or repository") + import_parser.add_argument( + "path", + help="Path to a file or directory within the project or repository", + ) import_parser.add_argument( "-o", "--out", nargs="?", help="Destination path to download files to" ) From 0e2ef470e82a1e299d9179870d0995069a335a41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 22:53:31 +0000 Subject: [PATCH 16/20] remove blank line --- dvc/repo/get.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 745e24fe1c..588e13ba1c 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -66,7 +66,6 @@ def get(url, path, out=None, rev=None): return # Either an uncached out with absolute path or a user error - if os.path.isabs(path): raise FileNotFoundError From 9f279b396846b7038840074563c1ac9c715f7ce8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 23:15:03 +0000 Subject: [PATCH 17/20] forbid absolute paths for plain git repositories --- dvc/repo/get.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 588e13ba1c..c2a187c67b 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -27,6 +27,11 @@ def __init__(self): ) +def _forbid_absolute_path(path): + if os.path.isabs(path): + raise FileNotFoundError + + @staticmethod def get(url, path, out=None, rev=None): out = resolve_output(path, out) @@ -66,8 +71,7 @@ def get(url, path, out=None, rev=None): return # Either an uncached out with absolute path or a user error - if os.path.isabs(path): - raise FileNotFoundError + _forbid_absolute_path(path) fs_copy(os.path.join(repo.root_dir, path), out) return @@ -76,6 +80,7 @@ def get(url, path, out=None, rev=None): # Not a DVC repository, continue below and copy from git pass + _forbid_absolute_path(path) raw_git_dir = cached_clone(url, rev=rev) fs_copy(os.path.join(raw_git_dir, path), out) except (OutputNotFoundError, FileNotFoundError): From 1f2eed80f8665b5630ac7bc341790d1d0f5cd567 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 23:16:19 +0000 Subject: [PATCH 18/20] don't manually clean up git repository --- dvc/repo/get.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index c2a187c67b..a1f99f671b 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -46,7 +46,6 @@ def get(url, path, out=None, rev=None): # and won't work with reflink/hardlink. dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) - raw_git_dir = None try: try: with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo: @@ -87,8 +86,6 @@ def get(url, path, out=None, rev=None): raise PathMissingError(path, url) finally: remove(tmp_dir) - if raw_git_dir: - remove(raw_git_dir) def _get_cached(repo, output, out): From f3d234ec9cb933009d39531e15edcd80e2059a5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 23:26:37 +0000 Subject: [PATCH 19/20] deduplicate code --- dvc/repo/get.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index a1f99f671b..3c417556c8 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -27,11 +27,6 @@ def __init__(self): ) -def _forbid_absolute_path(path): - if os.path.isabs(path): - raise FileNotFoundError - - @staticmethod def get(url, path, out=None, rev=None): out = resolve_output(path, out) @@ -46,6 +41,7 @@ def get(url, path, out=None, rev=None): # and won't work with reflink/hardlink. dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) + repo_dir = None try: try: with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo: @@ -70,18 +66,20 @@ def get(url, path, out=None, rev=None): return # Either an uncached out with absolute path or a user error - _forbid_absolute_path(path) - fs_copy(os.path.join(repo.root_dir, path), out) - return + repo_dir = repo.root_dir except NotDvcRepoError: # Not a DVC repository, continue below and copy from git pass - _forbid_absolute_path(path) - raw_git_dir = cached_clone(url, rev=rev) - fs_copy(os.path.join(raw_git_dir, path), out) + if os.path.isabs(path): + raise FileNotFoundError + + if not repo_dir: + repo_dir = cached_clone(url, rev=rev) + + fs_copy(os.path.join(repo_dir, path), out) except (OutputNotFoundError, FileNotFoundError): raise PathMissingError(path, url) finally: From de303d45ba987b3265851ed1768f20b21995b7e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 23:30:18 +0000 Subject: [PATCH 20/20] test removing a file outside a raw git repo --- tests/func/test_get.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 107d3dcc8c..cdbbe4bbdd 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -136,6 +136,14 @@ def test_absolute_file_outside_repo(tmp_dir, erepo_dir): Repo.get(fspath(erepo_dir), "/root/") +def test_absolute_file_outside_git_repo(tmp_dir, erepo_dir): + erepo_dir.scm.repo.index.remove([erepo_dir.dvc.dvc_dir], r=True) + erepo_dir.scm.commit("remove dvc") + + with pytest.raises(PathMissingError): + Repo.get(fspath(erepo_dir), "/root/") + + def test_unknown_path(tmp_dir, erepo_dir): with pytest.raises(PathMissingError): Repo.get(fspath(erepo_dir), "a_non_existing_file")