Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] gc: change option to use single remove flags #3207

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 27 additions & 28 deletions dvc/command/gc.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,18 @@

class CmdGC(CmdBase):
def run(self):
msg = "This will remove all cache except items used in "

msg += "the working tree"
if self.args.all_commits:
msg += " and all git commits"
elif self.args.all_branches and self.args.all_tags:
msg += " and all git branches and tags"
elif self.args.all_branches:
msg += " and all git branches"
elif self.args.all_tags:
msg += " and all git tags"
msg = (
"This will remove all cache except items used in the working tree"
"{tags}{history}{branches}{history}"
).format(
tags="" if self.args.remove_all_tags else "and all git tags",
branches=""
if self.args.remove_all_branches
else "and all git branches",
history=""
if self.args.remove_all_history
else "and their history",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes the phrase weird - many ands. Maybe just list things instead:

This will remove all cache not referenced from:
- the working tree
- all branch heads
...

? Will make it more glanceable, the changing items will be always in the same place, any user already familiar with gc, will be simply skipping the entering phrase without loose of any meaning.

)

if self.args.repos:
msg += " of the current and the following repos:"
Expand All @@ -39,9 +40,9 @@ def run(self):
return 1

self.repo.gc(
all_branches=self.args.all_branches,
all_tags=self.args.all_tags,
all_commits=self.args.all_commits,
remove_all_tags=self.args.remove_all_tags,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opted for explicit flags to avoid confusion such as previous --remove=tags,commits(where it is not clear what commits are and why you are not able to pass specific tag or branch to it) and also to simplify completion.

remove_all_branches=self.args.remove_all_branches,
remove_all_history=self.args.remove_all_history,
cloud=self.args.cloud,
remote=self.args.remote,
force=self.args.force,
Expand All @@ -65,34 +66,32 @@ def add_parser(subparsers, parent_parser):
formatter_class=argparse.RawDescriptionHelpFormatter,
)
gc_parser.add_argument(
"-a",
"--all-branches",
"-c",
"--cloud",
action="store_true",
default=False,
help="Keep data files for the tips of all git branches.",
help="Collect garbage in remote repository.",
)
gc_parser.add_argument(
"-T",
"--all-tags",
action="store_true",
default=False,
help="Keep data files for all git tags.",
"-r", "--remote", help="Remote storage to collect garbage in."
)
gc_parser.add_argument(
skshetry marked this conversation as resolved.
Show resolved Hide resolved
"--all-commits",
"--remove-all-tags",
action="store_true",
default=False,
help=argparse.SUPPRESS,
help="Remove cache for all git tags.",
)
gc_parser.add_argument(
"-c",
"--cloud",
"--remove-all-branches",
action="store_true",
default=False,
help="Collect garbage in remote repository.",
help="Remove cache for all git branches.",
)
gc_parser.add_argument(
"-r", "--remote", help="Remote storage to collect garbage in."
"--remove-all-history",
action="store_true",
default=False,
help="Remove cache for all history of all branches and tags.",
Comment on lines +91 to +94
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I recall, your intention was to remove history by default. Why did you change your mind?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Suor To keep it as safe as possible for now πŸ™‚ And, well, I implemented a proper approach by excluding refs when not deleting full history.

)
gc_parser.add_argument(
"-f",
Expand Down
5 changes: 4 additions & 1 deletion dvc/repo/brancher.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ def brancher( # noqa: E302
yield "working tree"

if all_commits:
revs = scm.list_all_commits()
revs = scm.list_all_commits(
exclude_all_tags=not all_tags,
exclude_all_branches=not all_branches,
)
Comment on lines +36 to +39
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes .brancher and .used_cache() interfaces lying ones, all_commits no longer means all commits.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This also makes dvc gc --remove-all-branches --remove-all-tags approaching noop for many scenarios, but not quite noop, a confusing situation. Suppose you make things iteratively and last commits only about some polish like README adjustments.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, the naming brancher is at fault here. It should really have been something like iter_commits() instead.

But --remove-all-branches --remove-all-tags is not a noop, it will exclude tags and branches from the refs for which it iterates through. Hence why I've mentioned an issue with refs/remotes and refs/stash etc. We should either exclude those by default or add an option to get rid of those. For now this is a "safe" approach, that can be revisited iteratively.

Copy link
Contributor

@Suor Suor Jan 28, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, it's even more confusing. You need a whole paragraph of text to explain how it works, but still it's usually just a noop.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Suor Well, that is because of refs/remotes and refs/stash. Will need to handle them somehow. To me seems like a good idea to just drop them by default and only keep refs/heads and refs/tags. What do you think?

else:
if all_branches:
revs.extend(scm.list_branches())
Expand Down
12 changes: 6 additions & 6 deletions dvc/repo/gc.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ def _do_gc(typ, func, clist):
@locked
def gc(
self,
all_branches=False,
remove_all_tags=False,
remove_all_branches=False,
remove_all_history=False,
cloud=False,
remote=None,
with_deps=False,
all_tags=False,
all_commits=False,
force=False,
jobs=None,
repos=None,
Expand All @@ -43,10 +43,10 @@ def gc(
for repo in all_repos + [self]:
used.update(
repo.used_cache(
all_branches=all_branches,
with_deps=with_deps,
all_tags=all_tags,
all_commits=all_commits,
all_branches=not remove_all_branches,
all_tags=not remove_all_tags,
all_commits=not remove_all_history,
remote=remote,
force=force,
jobs=jobs,
Expand Down
4 changes: 3 additions & 1 deletion dvc/scm/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,9 @@ def list_tags(self): # pylint: disable=no-self-use
"""Returns a list of available tags in the repo."""
return []

def list_all_commits(self): # pylint: disable=no-self-use
def list_all_commits(
self, exclude_all_tags=False, exclude_all_branches=False
): # pylint: disable=no-self-use
"""Returns a list of commits in the repo."""
return []

Expand Down
14 changes: 12 additions & 2 deletions dvc/scm/git/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,18 @@ def list_branches(self):
def list_tags(self):
return [t.name for t in self.repo.tags]

def list_all_commits(self):
return [c.hexsha for c in self.repo.iter_commits("--all")]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was a misuse of iter_commits, where passing --all to it worked accidentally because it was passing it as rev to rev-list. New implementation uses rev-list directly, which should be significantly faster (no benches yet).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, previous implementation was silently keeping refs/remotes and refs/stash as well, which might or might not be what we want, considering that we don't currently have any means of deleting cache for those.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So it was previously simply "keep anything, which is referenced from git". Now it's more complicated )

def list_all_commits(
self, exclude_all_tags=False, exclude_all_branches=False
):
args = []
if exclude_all_tags:
args.extend(["--exclude", "refs/tags/*"])
if exclude_all_branches:
args.extend(["--exclude", "refs/heads/*"])
# NOTE: order matters, `--exclude` needs to be before `--all` in order
# to affect it.
args.append("--all")
return self.repo.git.rev_list(*args).splitlines()

def _install_hook(self, name, cmd):
command = (
Expand Down
2 changes: 1 addition & 1 deletion scripts/completion/dvc.bash
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ _dvc_diff='-t --target'
_dvc_fetch='-j --jobs -r --remote -a --all-branches -T --all-tags -d --with-deps -R --recursive $(compgen -G *.dvc)'
_dvc_get_url=''
_dvc_get='-o --out --rev --show-url'
_dvc_gc='-a --all-branches -T --all-tags -c --cloud -r --remote -f --force -p --projects -j --jobs'
_dvc_gc='--remove-all-branches --remove-all-tags --remove-all-history -c --cloud -r --remote -f --force -p --projects -j --jobs'
_dvc_import_url='-f --file'
_dvc_import='-o --out --rev'
_dvc_init='--no-scm -f --force'
Expand Down
4 changes: 2 additions & 2 deletions scripts/completion/dvc.zsh
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,8 @@ _dvc_get=(
)

_dvc_gc=(
{-a,--all-branches}"[Keep data files for the tips of all git branches.]"
{-T,--all-tags}"[Keep data files for all git tags.]"
"--remove-all-branches[Keep data files for the tips of all git branches.]"
"--remove-all-tags[Keep data files for all git tags.]"
{-c,--cloud}"[Collect garbage in remote repository.]"
{-r,--remote}"[Remote storage to collect garbage in.]:Remote repository:"
{-f,--force}"[Force garbage collection - automatically agree to all prompts.]:Repos:_files"
Expand Down
18 changes: 14 additions & 4 deletions tests/func/test_gc.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,26 @@ def test(self):

self._check_cache(4)

self.dvc.gc(all_tags=True, all_branches=True)
self.dvc.gc()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will look into converting to pytest and possibly adding new tests soon.

self._check_cache(4)

self.dvc.gc(remove_all_history=True)
self._check_cache(3)

self.dvc.gc(all_tags=False, all_branches=True)
self.dvc.gc(remove_all_tags=True)
self._check_cache(3)

self.dvc.gc(remove_all_branches=True)
self._check_cache(2)

self.dvc.gc(all_tags=True, all_branches=False)
self.dvc.gc(remove_all_branches=True, remove_all_tags=True)
self._check_cache(2)

self.dvc.gc(
remove_all_branches=True,
remove_all_tags=True,
remove_all_history=True,
)
self._check_cache(1)


Expand Down Expand Up @@ -183,7 +193,7 @@ def test_all_commits(tmp_dir, scm, dvc):
tmp_dir.dvc_gen("testfile", "workspace")

n = _count_files(dvc.cache.local.cache_dir)
dvc.gc(all_commits=True)
dvc.gc()

# Only one uncommitted file should go away
assert _count_files(dvc.cache.local.cache_dir) == n - 1
Expand Down
39 changes: 39 additions & 0 deletions tests/unit/command/test_gc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from dvc.cli import parse_args
from dvc.command.gc import CmdGC


def test_gc(mocker):
cli_args = parse_args(
[
"gc",
"--remove-all-tags",
"--remove-all-branches",
"--remove-all-history",
"--cloud",
"--remote",
"myremote",
"--force",
"--jobs",
"10",
"--projects",
"/some/path",
"/some/other/path",
]
)
assert cli_args.func == CmdGC

cmd = cli_args.func(cli_args)
m = mocker.patch("dvc.repo.Repo.gc")

assert cmd.run() == 0

m.assert_called_once_with(
remove_all_tags=True,
remove_all_branches=True,
remove_all_history=True,
cloud=True,
remote="myremote",
force=True,
jobs=10,
repos=["/some/path", "/some/other/path"],
)
13 changes: 13 additions & 0 deletions tests/unit/scm/test_git.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,16 @@ def test_belongs_to_scm_true_on_git_internal(self):
def test_belongs_to_scm_false(self):
path = os.path.join("some", "non-.git", "file")
self.assertFalse(self.dvc.scm.belongs_to_scm(path))


def test_list_all_commits_detached_head(tmp_dir, scm):
tmp_dir.scm_gen({"first": "first"}, commit="first")
tmp_dir.scm_gen({"second": "second"}, commit="second")
scm.branch("branch_second")
scm.checkout("branch_third", create_new=True)
tmp_dir.scm_gen({"third": "third"}, commit="third")
scm.checkout("branch_second")
assert len(scm.list_all_commits()) == 3
# deleting the branch so that `third` commit gets lost
scm.repo.git.branch("branch_third", D=True)
assert len(scm.list_all_commits()) == 2