Skip to content

Commit

Permalink
dvc: drop cached external outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
efiop committed Jun 9, 2023
1 parent 262419e commit cf233e0
Show file tree
Hide file tree
Showing 20 changed files with 70 additions and 312 deletions.
11 changes: 0 additions & 11 deletions dvc/cachemgr.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,6 @@ def _get_odb(

class CacheManager:
CACHE_DIR = "cache"
CLOUD_SCHEMES = [
Schemes.S3,
Schemes.GS,
Schemes.SSH,
Schemes.HDFS,
Schemes.WEBHDFS,
]
FILES_DIR = "files"

def __init__(self, repo):
Expand Down Expand Up @@ -91,16 +84,12 @@ def _init_odb(self, schemes):
)

def __getattr__(self, name):
if name not in self._odb and name in self.CLOUD_SCHEMES:
self._init_odb([name])

try:
return self._odb[name]
except KeyError as exc:
raise AttributeError from exc

def by_scheme(self):
self._init_odb(self.CLOUD_SCHEMES)
yield from self._odb.items()

@property
Expand Down
9 changes: 0 additions & 9 deletions dvc/commands/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@ def validate_args(self) -> None:
invalid_opt = "--glob option"
elif args.no_commit:
invalid_opt = "--no-commit option"
elif args.external:
invalid_opt = "--external option"
else:
message = "{option} can't be used without --to-remote"
if args.remote:
Expand All @@ -49,7 +47,6 @@ def run(self):
self.repo.add(
self.args.targets,
no_commit=self.args.no_commit,
external=self.args.external,
glob=self.args.glob,
out=self.args.out,
remote=self.args.remote,
Expand Down Expand Up @@ -82,12 +79,6 @@ def add_parser(subparsers, parent_parser):
default=False,
help="Don't put files/directories into cache.",
)
parser.add_argument(
"--external",
action="store_true",
default=False,
help="Allow targets that are outside of the DVC repository.",
)
parser.add_argument(
"--glob",
action="store_true",
Expand Down
14 changes: 7 additions & 7 deletions dvc/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,13 +154,13 @@ def __call__(self, data):
"machine": Lower,
},
"cache": {
"local": str,
"s3": str,
"gs": str,
"hdfs": str,
"webhdfs": str,
"ssh": str,
"azure": str,
"local": str, # obsoleted
"s3": str, # obsoleted
"gs": str, # obsoleted
"hdfs": str, # obsoleted
"webhdfs": str, # obsoleted
"ssh": str, # obsoleted
"azure": str, # obsoleted
# This is for default local cache
"dir": str,
**LOCAL_COMMON,
Expand Down
15 changes: 0 additions & 15 deletions dvc/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,21 +299,6 @@ def __init__(self, url):
super().__init__(f"The path '{url}' does not exist")


class RemoteCacheRequiredError(DvcException):
def __init__(self, scheme, fs_path):
super().__init__(
(
"Current operation was unsuccessful because '{}' requires "
"existing cache on '{}' remote. See {} for information on how "
"to set up remote cache."
).format(
fs_path,
scheme,
format_link("https://man.dvc.org/config#cache"),
)
)


class IsADirectoryError(DvcException): # noqa,pylint:disable=redefined-builtin
"""Raised when a file operation is requested on a directory."""

Expand Down
12 changes: 3 additions & 9 deletions dvc/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
ConfirmRemoveError,
DvcException,
MergeError,
RemoteCacheRequiredError,
)
from dvc.utils.objects import cached_property
from dvc_data.hashfile import check as ocheck
Expand Down Expand Up @@ -526,14 +525,9 @@ def use_scm_ignore(self):
def cache(self):
from dvc.cachemgr import LEGACY_HASH_NAMES

if self.is_in_repo:
odb_name = "legacy" if self.hash_name in LEGACY_HASH_NAMES else "repo"
else:
odb_name = self.protocol
odb = getattr(self.repo.cache, odb_name)
if self.use_cache and odb is None:
raise RemoteCacheRequiredError(self.fs.protocol, self.fs_path)
return odb
assert self.is_in_repo
odb_name = "legacy" if self.hash_name in LEGACY_HASH_NAMES else "repo"
return getattr(self.repo.cache, odb_name)

@property
def local_cache(self):
Expand Down
4 changes: 0 additions & 4 deletions dvc/repo/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def find_targets(
def get_or_create_stage(
repo: "Repo",
target: str,
external: bool = False,
out: Optional[str] = None,
to_remote: bool = False,
force: bool = False,
Expand All @@ -79,7 +78,6 @@ def get_or_create_stage(
fname=path,
wdir=wdir,
outs=[out],
external=external,
force=force,
)
return StageInfo(stage, output_exists=False)
Expand Down Expand Up @@ -199,7 +197,6 @@ def add(
repo: "Repo",
targets: Union["StrOrBytesPath", Iterator["StrOrBytesPath"]],
no_commit: bool = False,
external: bool = False,
glob: bool = False,
out: Optional[str] = None,
remote: Optional[str] = None,
Expand All @@ -215,7 +212,6 @@ def add(
target: get_or_create_stage(
repo,
target,
external=external,
out=out,
to_remote=to_remote,
force=force,
Expand Down
5 changes: 2 additions & 3 deletions dvc/stage/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ class RawData:
generated_from: Optional[str] = None


def create_stage(cls: Type[_T], repo, path, external=False, **kwargs) -> _T:
def create_stage(cls: Type[_T], repo, path, **kwargs) -> _T:
from dvc.dvcfile import check_dvcfile_path

wdir = os.path.abspath(kwargs.get("wdir", None) or os.curdir)
Expand All @@ -101,8 +101,7 @@ def create_stage(cls: Type[_T], repo, path, external=False, **kwargs) -> _T:

stage = loads_from(cls, repo, path, wdir, kwargs)
fill_stage_outputs(stage, **kwargs)
if not external:
check_no_externals(stage)
check_no_externals(stage)
fill_stage_dependencies(
stage, **project(kwargs, ["deps", "erepo", "params", "fs_config"])
)
Expand Down
10 changes: 7 additions & 3 deletions dvc/stage/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,12 @@ def _can_hash(stage):
return False

for out in stage.outs:
if out.protocol != "local" or not out.def_path or out.persist:
if (
out.protocol != "local"
or not out.def_path
or out.persist
or not out.is_in_repo
):
return False

return True
Expand Down Expand Up @@ -112,7 +117,6 @@ def _create_stage(self, cache, wdir=None):
cmd=cache["cmd"],
wdir=wdir,
outs=[out["path"] for out in cache["outs"]],
external=True,
)
StageLoader.fill_from_lock(stage, cache)
return stage
Expand All @@ -137,7 +141,7 @@ def _uncached_outs(self, stage, cache):
# NOTE: using copy link to make it look like a git-tracked file
with self._cache_type_copy():
for out in cached_stage.outs:
if out.def_path in outs_no_cache:
if out.def_path in outs_no_cache and out.is_in_repo:
yield out

def save(self, stage):
Expand Down
20 changes: 8 additions & 12 deletions dvc/stage/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,28 +87,24 @@ def fill_stage_dependencies(stage, deps=None, erepo=None, params=None, fs_config


def check_no_externals(stage):
from urllib.parse import urlparse

from dvc.utils import format_link

# NOTE: preventing users from accidentally using external outputs. See
# https://github.com/iterative/dvc/issues/1545 for more details.

def _is_external(out):
# NOTE: in case of `remote://` notation, the user clearly knows that
# this is an advanced feature and so we shouldn't error-out.
if out.is_in_repo or urlparse(out.def_path).scheme == "remote":
def _is_cached_external(out):
if out.is_in_repo or not out.use_cache:
return False
return True

outs = [str(out) for out in stage.outs if _is_external(out)]
outs = [str(out) for out in stage.outs if _is_cached_external(out)]
if not outs:
return

str_outs = ", ".join(outs)
link = format_link("https://dvc.org/doc/user-guide/managing-external-data")
link = format_link(
"https://dvc.org/doc/user-guide/pipelines/external-dependencies-and-outputs"
)
raise StageExternalOutputsError(
f"Output(s) outside of DVC project: {str_outs}. See {link} for more info."
f"Cached output(s) outside of DVC project: {str_outs}. "
f"See {link} for more info."
)


Expand Down
51 changes: 0 additions & 51 deletions dvc/testing/workspace_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,57 +153,6 @@ def test_import_no_download(self, tmp_dir, dvc, remote_version_aware):
assert dvc.status() == {}


class TestAdd:
@pytest.fixture
def hash_name(self):
pytest.skip()

@pytest.fixture
def hash_value(self):
pytest.skip()

@pytest.fixture
def dir_hash_value(self):
pytest.skip()

def test_add(self, tmp_dir, dvc, workspace, hash_name, hash_value):
from dvc.stage.exceptions import StageExternalOutputsError

workspace.gen("file", "file")

with pytest.raises(StageExternalOutputsError):
dvc.add(workspace.url)

dvc.add("remote://workspace/file")
assert (tmp_dir / "file.dvc").read_text() == (
"outs:\n"
f"- {hash_name}: {hash_value}\n"
" size: 4\n"
" hash: md5\n"
" path: remote://workspace/file\n"
)
assert (workspace / "file").read_text() == "file"
assert (
workspace / "cache" / "files" / "md5" / hash_value[:2] / hash_value[2:]
).read_text() == "file"

assert dvc.status() == {}

# pylint: disable-next=unused-argument
def test_add_dir(self, tmp_dir, dvc, workspace, hash_name, dir_hash_value):
workspace.gen({"dir": {"file": "file", "subdir": {"subfile": "subfile"}}})

dvc.add("remote://workspace/dir")
assert (
workspace
/ "cache"
/ "files"
/ "md5"
/ dir_hash_value[:2]
/ dir_hash_value[2:]
).is_file()


def match_files(fs, entries, expected):
entries_content = {(fs.path.normpath(d["path"]), d["isdir"]) for d in entries}
expected_content = {(fs.path.normpath(d["path"]), d["isdir"]) for d in expected}
Expand Down
Loading

0 comments on commit cf233e0

Please sign in to comment.