diff --git a/dvc/command/add.py b/dvc/command/add.py index 7028830d2f..18daa4573a 100644 --- a/dvc/command/add.py +++ b/dvc/command/add.py @@ -18,6 +18,7 @@ def run(self): recursive=self.args.recursive, no_commit=self.args.no_commit, fname=self.args.file, + external=self.args.external, ) except DvcException: @@ -49,6 +50,12 @@ def add_parser(subparsers, parent_parser): default=False, help="Don't put files/directories into cache.", ) + parser.add_argument( + "--external", + action="store_true", + default=False, + help="Allow targets that are outside of the DVC repository.", + ) parser.add_argument( "-f", "--file", diff --git a/dvc/command/run.py b/dvc/command/run.py index 6f56e754e9..207a9c4486 100644 --- a/dvc/command/run.py +++ b/dvc/command/run.py @@ -53,6 +53,7 @@ def run(self): always_changed=self.args.always_changed, name=self.args.name, single_stage=self.args.single_stage, + external=self.args.external, ) except DvcException: logger.exception("") @@ -225,6 +226,12 @@ def add_parser(subparsers, parent_parser): default=False, help=argparse.SUPPRESS, ) + run_parser.add_argument( + "--external", + action="store_true", + default=False, + help="Allow outputs that are outside of the DVC repository.", + ) run_parser.add_argument( "command", nargs=argparse.REMAINDER, help="Command to execute." ) diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 216e0cf0ed..26922aa961 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -21,7 +21,9 @@ @locked @scm_context -def add(repo, targets, recursive=False, no_commit=False, fname=None): +def add( + repo, targets, recursive=False, no_commit=False, fname=None, external=False +): if recursive and fname: raise RecursiveAddingWhileUsingFilename() @@ -52,7 +54,9 @@ def add(repo, targets, recursive=False, no_commit=False, fname=None): ) ) - stages = _create_stages(repo, sub_targets, fname, pbar=pbar) + stages = _create_stages( + repo, sub_targets, fname, pbar=pbar, external=external + ) try: repo.check_modified_graph(stages) @@ -120,7 +124,7 @@ def _find_all_targets(repo, target, recursive): return [target] -def _create_stages(repo, targets, fname, pbar=None): +def _create_stages(repo, targets, fname, pbar=None, external=False): from dvc.stage import Stage, create_stage stages = [] @@ -132,7 +136,14 @@ def _create_stages(repo, targets, fname, pbar=None): unit="file", ): path, wdir, out = resolve_paths(repo, out) - stage = create_stage(Stage, repo, fname or path, wdir=wdir, outs=[out]) + stage = create_stage( + Stage, + repo, + fname or path, + wdir=wdir, + outs=[out], + external=external, + ) if stage: Dvcfile(repo, stage.path).remove_with_prompt(force=True) diff --git a/dvc/stage/__init__.py b/dvc/stage/__init__.py index 725cb1e0e6..0728827e98 100644 --- a/dvc/stage/__init__.py +++ b/dvc/stage/__init__.py @@ -19,6 +19,7 @@ check_circular_dependency, check_duplicated_arguments, check_missing_outputs, + check_no_externals, check_stage_path, compute_md5, fill_stage_dependencies, @@ -52,7 +53,7 @@ def loads_from(cls, repo, path, wdir, data): return cls(**kw) -def create_stage(cls, repo, path, **kwargs): +def create_stage(cls, repo, path, external=False, **kwargs): from dvc.dvcfile import check_dvc_filename wdir = os.path.abspath(kwargs.get("wdir", None) or os.curdir) @@ -63,6 +64,8 @@ def create_stage(cls, repo, path, **kwargs): stage = loads_from(cls, repo, path, wdir, kwargs) fill_stage_outputs(stage, **kwargs) + if not external: + check_no_externals(stage) fill_stage_dependencies( stage, **project(kwargs, ["deps", "erepo", "params"]) ) diff --git a/dvc/stage/cache.py b/dvc/stage/cache.py index d94927d2ed..636736fddc 100644 --- a/dvc/stage/cache.py +++ b/dvc/stage/cache.py @@ -102,6 +102,7 @@ def _create_stage(self, cache, wdir=None): params=params, deps=[dep["path"] for dep in cache.get("deps", [])], outs=[out["path"] for out in cache["outs"]], + external=True, ) StageLoader.fill_from_lock(stage, cache) return stage diff --git a/dvc/stage/exceptions.py b/dvc/stage/exceptions.py index 6ad7c2f81e..eda6db716f 100644 --- a/dvc/stage/exceptions.py +++ b/dvc/stage/exceptions.py @@ -58,6 +58,10 @@ class StageCommitError(DvcException): pass +class StageExternalOutputsError(DvcException): + pass + + class StageUpdateError(DvcException): def __init__(self, path): super().__init__( diff --git a/dvc/stage/utils.py b/dvc/stage/utils.py index bf6cf86e6c..321f5088fe 100644 --- a/dvc/stage/utils.py +++ b/dvc/stage/utils.py @@ -6,9 +6,10 @@ from dvc.utils.fs import path_isin from ..remote import LocalRemote, S3Remote -from ..utils import dict_md5, relpath +from ..utils import dict_md5, format_link, relpath from .exceptions import ( MissingDataSource, + StageExternalOutputsError, StagePathNotDirectoryError, StagePathNotFoundError, StagePathOutsideError, @@ -68,6 +69,31 @@ def fill_stage_dependencies(stage, deps=None, erepo=None, params=None): stage.deps += dependency.loads_params(stage, params or []) +def check_no_externals(stage): + from urllib.parse import urlparse + + # NOTE: preventing users from accidentally using external outputs. See + # https://github.com/iterative/dvc/issues/1545 for more details. + + def _is_external(out): + # NOTE: in case of `remote://` notation, the user clearly knows that + # this is an advanced feature and so we shouldn't error-out. + if out.is_in_repo or urlparse(out.def_path).scheme == "remote": + return False + return True + + outs = [str(out) for out in stage.outs if _is_external(out)] + if not outs: + return + + str_outs = ", ".join(outs) + link = format_link("https://dvc.org/doc/user-guide/managing-external-data") + raise StageExternalOutputsError( + f"Output(s) outside of DVC project: {str_outs}. " + f"See {link} for more info." + ) + + def check_circular_dependency(stage): from dvc.exceptions import CircularDependencyError diff --git a/scripts/completion/dvc.bash b/scripts/completion/dvc.bash index 9917030ff2..3ed31bcfec 100644 --- a/scripts/completion/dvc.bash +++ b/scripts/completion/dvc.bash @@ -12,7 +12,7 @@ _dvc_commands='add cache checkout commit config dag destroy diff fetch freeze \ _dvc_options='-h --help -V --version' _dvc_global_options='-h --help -q --quiet -v --verbose' -_dvc_add='-R --recursive -f --file --no-commit' +_dvc_add='-R --recursive -f --file --no-commit --external' _dvc_add_COMPGEN=_dvc_compgen_files _dvc_cache='dir' _dvc_cache_dir=' --global --system --local -u --unset' @@ -65,7 +65,7 @@ _dvc_remove_COMPGEN=_dvc_compgen_DVCFiles _dvc_repro='-f --force -s --single-item -c --cwd -m --metrics --dry -i --interactive -p --pipeline -P --all-pipelines --no-run-cache --force-downstream --no-commit -R --recursive --downstream' _dvc_repro_COMPGEN=_dvc_compgen_DVCFiles _dvc_root='' -_dvc_run='--no-exec -f --file -d --deps -o --outs -O --outs-no-cache --outs-persist --outs-persist-no-cache -m --metrics -M --metrics-no-cache --overwrite-dvcfile --no-run-cache --no-commit -w --wdir' +_dvc_run='--no-exec -f --file -d --deps -o --outs -O --outs-no-cache --outs-persist --outs-persist-no-cache -m --metrics -M --metrics-no-cache --overwrite-dvcfile --no-run-cache --no-commit -w --wdir --external' _dvc_run_COMPGEN=_dvc_compgen_DVCFiles _dvc_status='-j --jobs -r --remote -a --all-branches -T --all-tags -d --with-deps -c --cloud' _dvc_status_COMPGEN=_dvc_compgen_DVCFiles diff --git a/scripts/completion/dvc.zsh b/scripts/completion/dvc.zsh index baacc5e54b..d37ce5426e 100644 --- a/scripts/completion/dvc.zsh +++ b/scripts/completion/dvc.zsh @@ -66,6 +66,7 @@ _dvc_add=( {-R,--recursive}"[Recursively add each file under the directory.]" "--no-commit[Don't put files/directories into cache.]" {-f,--file}"[Specify name of the DVC-file it generates.]:File:_files" + "--external[Allow targets that are outside of the DVC project.]" "1:File:_files" ) @@ -267,6 +268,7 @@ _dvc_run=( "--no-commit[Don't put files/directories into cache.]" "--outs-persist[Declare output file or directory that will not be removed upon repro.]:Output persistent:_files" "--outs-persist-no-cache[Declare output file or directory that will not be removed upon repro (do not put into DVC cache).]:Output persistent regular:_files" + "--external[Allow outputs that are outside of the DVC project.]" ) _dvc_status=( diff --git a/tests/func/test_add.py b/tests/func/test_add.py index fc4a249c3c..687a2c3065 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -187,11 +187,16 @@ def test_add_file_in_dir(tmp_dir, dvc): class TestAddExternalLocalFile(TestDvc): def test(self): + from dvc.stage.exceptions import StageExternalOutputsError + dname = TestDvc.mkdtemp() fname = os.path.join(dname, "foo") shutil.copyfile(self.FOO, fname) - stages = self.dvc.add(fname) + with self.assertRaises(StageExternalOutputsError): + self.dvc.add(fname) + + stages = self.dvc.add(fname, external=True) self.assertEqual(len(stages), 1) stage = stages[0] self.assertNotEqual(stage, None) diff --git a/tests/func/test_gc.py b/tests/func/test_gc.py index 5520893718..0685ed972d 100644 --- a/tests/func/test_gc.py +++ b/tests/func/test_gc.py @@ -153,7 +153,7 @@ def test(self): cwd = os.getcwd() os.chdir(self.additional_path) # ADD FILE ONLY IN SECOND PROJECT - fname = os.path.join(self.additional_path, "only_in_second") + fname = "only_in_second" with open(fname, "w+") as fobj: fobj.write("only in additional repo") @@ -161,7 +161,7 @@ def test(self): self.assertEqual(len(stages), 1) # ADD FILE IN SECOND PROJECT THAT IS ALSO IN MAIN PROJECT - fname = os.path.join(self.additional_path, "in_both") + fname = "in_both" with open(fname, "w+") as fobj: fobj.write("in both repos") diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 515dba1366..232a0d6abb 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -101,7 +101,8 @@ def test_get_full_dvc_path(tmp_dir, erepo_dir, tmp_path_factory): external_data.write_text("ext_data") with erepo_dir.chdir(): - erepo_dir.dvc_add(os.fspath(external_data), commit="add external data") + erepo_dir.dvc.add(os.fspath(external_data), external=True) + erepo_dir.scm_add("ext_data.dvc", commit="add external data") Repo.get( os.fspath(erepo_dir), os.fspath(external_data), "ext_data_imported" diff --git a/tests/func/test_repro.py b/tests/func/test_repro.py index 7581d16e8e..ef677b13eb 100644 --- a/tests/func/test_repro.py +++ b/tests/func/test_repro.py @@ -963,6 +963,7 @@ def test(self, mock_prompt): deps=[out_foo_path], cmd=self.cmd(foo_path, bar_path), name="external-base", + external=True, ) self.assertEqual(self.dvc.status([cmd_stage.addressing]), {}) diff --git a/tests/unit/command/test_run.py b/tests/unit/command/test_run.py index ad6e9d921f..26510cdd0f 100644 --- a/tests/unit/command/test_run.py +++ b/tests/unit/command/test_run.py @@ -39,6 +39,7 @@ def test_run(mocker, dvc): "file:param1,param2", "--params", "param3", + "--external", "command", ] ) @@ -70,6 +71,7 @@ def test_run(mocker, dvc): cmd="command", name="nam", single_stage=False, + external=True, ) @@ -99,6 +101,7 @@ def test_run_args_from_cli(mocker, dvc): cmd="echo foo", name=None, single_stage=False, + external=False, ) @@ -128,4 +131,5 @@ def test_run_args_with_spaces(mocker, dvc): cmd='echo "foo bar"', name=None, single_stage=False, + external=False, ) diff --git a/tests/unit/stage/test_stage.py b/tests/unit/stage/test_stage.py index 3af3c05aaa..3da30f23a0 100644 --- a/tests/unit/stage/test_stage.py +++ b/tests/unit/stage/test_stage.py @@ -102,3 +102,21 @@ def test_always_changed(dvc): with dvc.lock: assert stage.changed() assert stage.status()["path"] == ["always changed"] + + +def test_external_outs(tmp_path_factory, dvc): + from dvc.stage import create_stage + from dvc.stage.exceptions import StageExternalOutputsError + + tmp_path = tmp_path_factory.mktemp("external-outs") + foo = tmp_path / "foo" + foo.write_text("foo") + + with pytest.raises(StageExternalOutputsError): + create_stage(Stage, dvc, "path.dvc", outs=[os.fspath(foo)]) + + with dvc.config.edit() as conf: + conf["remote"]["myremote"] = {"url": os.fspath(tmp_path)} + + create_stage(Stage, dvc, "path.dvc", outs=["remote://myremote/foo"]) + create_stage(Stage, dvc, "path.dvc", outs=[os.fspath(foo)], external=True)