Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

import-url/update: add --no-download flag #8024

Merged
merged 3 commits into from
Aug 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion dvc/commands/data_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ def run(self):
)
self.log_summary(stats)
except (CheckoutError, DvcException) as exc:
self.log_summary(getattr(exc, "stats", {}))
if stats := getattr(exc, "stats", {}):
self.log_summary(stats)
logger.exception("failed to pull data from the cloud")
return 1

Expand Down
13 changes: 11 additions & 2 deletions dvc/commands/imp.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def run(self):
fname=self.args.file,
rev=self.args.rev,
no_exec=self.args.no_exec,
no_download=self.args.no_download,
desc=self.args.desc,
jobs=self.args.jobs,
)
Expand Down Expand Up @@ -69,11 +70,19 @@ def add_parser(subparsers, parent_parser):
help="Specify name of the .dvc file this command will generate.",
metavar="<filename>",
)
import_parser.add_argument(
no_download_exec_group = import_parser.add_mutually_exclusive_group()
no_download_exec_group.add_argument(
"--no-exec",
action="store_true",
default=False,
help="Only create .dvc file without actually downloading it.",
help="Only create .dvc file without actually importing target data.",
)
no_download_exec_group.add_argument(
"--no-download",
action="store_true",
default=False,
help="Create .dvc file including target data hash value(s)"
" but do not actually download the file(s).",
)
import_parser.add_argument(
"--desc",
Expand Down
13 changes: 11 additions & 2 deletions dvc/commands/imp_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def run(self):
out=self.args.out,
fname=self.args.file,
no_exec=self.args.no_exec,
no_download=self.args.no_download,
remote=self.args.remote,
to_remote=self.args.to_remote,
desc=self.args.desc,
Expand Down Expand Up @@ -66,11 +67,19 @@ def add_parser(subparsers, parent_parser):
help="Specify name of the .dvc file this command will generate.",
metavar="<filename>",
).complete = completion.DIR
import_parser.add_argument(
no_download_exec_group = import_parser.add_mutually_exclusive_group()
no_download_exec_group.add_argument(
"--no-exec",
action="store_true",
default=False,
help="Only create .dvc file without actually downloading it.",
help="Only create .dvc file without actually importing target data.",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
help="Only create .dvc file without actually importing target data.",
help="Only create .dvc file without downloading data or getting checksums.",

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thoughts? Does this help differentiate the options?

)
no_download_exec_group.add_argument(
"--no-download",
action="store_true",
default=False,
help="Create .dvc file including target data hash value(s)"
" but do not actually download the file(s).",
)
import_parser.add_argument(
"--to-remote",
Expand Down
8 changes: 8 additions & 0 deletions dvc/commands/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def run(self):
rev=self.args.rev,
recursive=self.args.recursive,
to_remote=self.args.to_remote,
no_download=self.args.no_download,
remote=self.args.remote,
jobs=self.args.jobs,
)
Expand Down Expand Up @@ -55,6 +56,13 @@ def add_parser(subparsers, parent_parser):
default=False,
help="Update all stages in the specified directory.",
)
update_parser.add_argument(
"--no-download",
action="store_true",
default=False,
help="Update .dvc file hash value(s)"
" but do not download the file(s).",
)
update_parser.add_argument(
"--to-remote",
action="store_true",
Expand Down
5 changes: 5 additions & 0 deletions dvc/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,11 @@ def __str__(self):

return self.fs.path.relpath(self.fs_path, self.repo.root_dir)

def clear(self):
self.hash_info = HashInfo.from_dict({})
self.meta = Meta.from_dict({})
self.obj = None

@property
def protocol(self):
return self.fs.protocol
Expand Down
41 changes: 40 additions & 1 deletion dvc/repo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from collections import defaultdict
from contextlib import contextmanager
from functools import wraps
from typing import TYPE_CHECKING, Callable, Optional
from typing import TYPE_CHECKING, Callable, List, Optional

from funcy import cached_property

Expand All @@ -18,6 +18,7 @@
from dvc.fs import FileSystem
from dvc.repo.scm_context import SCMContext
from dvc.scm import Base
from dvc.stage import Stage

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -444,6 +445,44 @@ def used_objs(

return used

def partial_imports(
self,
targets=None,
all_branches=False,
all_tags=False,
all_commits=False,
all_experiments=False,
commit_date: Optional[str] = None,
recursive=False,
revs=None,
num=1,
) -> List["Stage"]:
"""Get the stages related to the given target and collect dependencies
which are missing outputs.

This is useful to retrieve files which have been imported to the repo
using --no-download.

Returns:
A list of partially imported stages
"""
from itertools import chain

partial_imports = chain.from_iterable(
self.index.partial_imports(targets, recursive=recursive)
for _ in self.brancher(
Comment on lines +471 to +473
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One other thing to note is that this change makes us call brancher more than once on fetch - first to collect regular/cached used objects and then a second time to collect partial imports. This is expensive if we are fetching multiple revs (i.e. the user uses --all-commits).

Ideally we want to walk the repo once per revision, and collect everything we need in that one pass.

revs=revs,
all_branches=all_branches,
all_tags=all_tags,
all_commits=all_commits,
all_experiments=all_experiments,
commit_date=commit_date,
num=num,
)
)

return list(partial_imports)

@property
def stages(self): # obsolete, only for backward-compatibility
return self.index.stages
Expand Down
34 changes: 34 additions & 0 deletions dvc/repo/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,18 @@ def fetch(
downloaded += d
failed += f

d, f = _fetch_partial_imports(
self,
targets,
all_branches=all_branches,
all_tags=all_tags,
all_commits=all_commits,
recursive=recursive,
revs=revs,
)
downloaded += d
failed += f

if failed:
raise DownloadError(failed)

Expand All @@ -107,3 +119,25 @@ def _fetch(repo, obj_ids, **kwargs):
except FileTransferError as exc:
failed += exc.amount
return downloaded, failed


def _fetch_partial_imports(repo, targets, **kwargs):
from dvc.stage.exceptions import DataSourceChanged

downloaded = 0
failed = 0
for stage in repo.partial_imports(targets, **kwargs):
try:
stage.run()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using run()/sync_import() here means that this is always a pull and not really a fetch. The import will be downloaded into the workspace and then saved/committed to cache, but the output will remain "checked out" in the workspace, even if the user ran fetch.

in this workflow I would not expect file to be present in my workspace (it should only be in .dvc/cache to be checked out later):

$ dvc import-url "azure://test-container/dir/file" --no-download
Importing 'azure://test-container/dir/file' -> 'file'
$ ls
file.dvc  tags
$ dvc fetch
Importing 'azure://test-container/dir/file' -> 'file'
1 file fetched
$ ls
file  file.dvc  tags

except DataSourceChanged as exc:
logger.warning(f"{exc}")
failed += 1
continue
if not any(
kwargs.get(kw, None)
for kw in ("all_branches", "all_tags", "all_commits", "revs")
):
stage.dump()

downloaded += 1
return downloaded, failed
7 changes: 4 additions & 3 deletions dvc/repo/imp_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def imp_url(
fname=None,
erepo=None,
frozen=True,
no_download=False,
no_exec=False,
remote=None,
to_remote=False,
Expand All @@ -31,9 +32,9 @@ def imp_url(
self, out, always_local=to_remote and not out
)

if to_remote and no_exec:
if to_remote and (no_exec or no_download):
raise InvalidArgumentError(
"--no-exec can't be combined with --to-remote"
"--no-exec/--no-download cannot be combined with --to-remote"
)

if not to_remote and remote:
Expand Down Expand Up @@ -80,7 +81,7 @@ def imp_url(
stage.save_deps()
stage.md5 = stage.compute_md5()
else:
stage.run(jobs=jobs)
stage.run(jobs=jobs, no_download=no_download)

stage.frozen = frozen

Expand Down
22 changes: 22 additions & 0 deletions dvc/repo/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,28 @@ def used_objs(
used[odb].update(objs)
return used

def partial_imports(
self,
targets: "TargetType" = None,
recursive: bool = False,
) -> List["Stage"]:
from itertools import chain

from dvc.utils.collections import ensure_list

collect_targets: Sequence[Optional[str]] = (None,)
if targets:
collect_targets = ensure_list(targets)

pairs = chain.from_iterable(
self.stage_collector.collect_granular(
target, recursive=recursive, with_deps=True
)
for target in collect_targets
)

return [stage for stage, _ in pairs if stage.is_partial_import]

# Following methods help us treat the collection as a set-like structure
# and provides faux-immutability.
# These methods do not preserve stages order.
Expand Down
14 changes: 13 additions & 1 deletion dvc/repo/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ def update(
rev=None,
recursive=False,
to_remote=False,
no_download=False,
remote=None,
jobs=None,
):
Expand All @@ -21,6 +22,11 @@ def update(
if isinstance(targets, str):
targets = [targets]

if to_remote and no_download:
raise InvalidArgumentError(
"--to-remote can't be used with --no-download"
)

if not to_remote and remote:
raise InvalidArgumentError(
"--remote can't be used without --to-remote"
Expand All @@ -31,7 +37,13 @@ def update(
stages.update(self.stage.collect(target, recursive=recursive))

for stage in stages:
stage.update(rev, to_remote=to_remote, remote=remote, jobs=jobs)
stage.update(
rev,
to_remote=to_remote,
remote=remote,
no_download=no_download,
jobs=jobs,
)
dvcfile = Dvcfile(self, stage.path)
dvcfile.dump(stage)

Expand Down
Loading