From 720d7244732e317bc99a61782047dd2d9d8451f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Mon, 28 Jun 2021 16:04:33 +0545 Subject: [PATCH 1/3] add: do not delete stage files before add Because of the way we collect stages and cache them, we were not able to collect them for the `add` without removing them from the workspace. As doing so, we'd have two same/similar stages - one collected from the workspace and the other just created from the `dvc add` in-memory. This would raise errors during graph checks, so we started to delete them and reset them (which is very recently, see #2886 and #3349). By deleting the file before we even do any checks, we are making DVC fragile, and results in data loss for the users with even simple mistakes. This should make it more reliable and robust. And, recently, we have started to keep state of a lot of things, that by resetting them on each stage, we waste a lot of performance, especially on gitignores. We cache the dulwich's IgnoreManager, which when resetted too many times, will waste a lot of our time just collecting them again next time (see #6227). It's hard to say how much this improves, as this very much depends on no. of gitignores in the repo (which can be assumed to be quite in number for a dvc repo) and the amount of files that we are adding (eg: `-R` adding a large directory). On a directory with 10,000 files (in a datadet-registry repo), creating stages on `dvc add -R` went from 64 files/sec to 1.1k files/sec. --- dvc/repo/add.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dvc/repo/add.py b/dvc/repo/add.py index b7f09658bd..f27978f09e 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -98,8 +98,11 @@ def add( # noqa: C901 **kwargs, ) + # remove existing stages that are to-be replaced with these + # new stages for the graph checks. + old_stages = set(repo.stages) - set(stages) try: - repo.check_modified_graph(stages) + repo.check_modified_graph(stages, list(old_stages)) except OverlappingOutputPathsError as exc: msg = ( "Cannot add '{out}', because it is overlapping with other " @@ -233,7 +236,6 @@ def _create_stages( transfer=False, **kwargs, ): - from dvc.dvcfile import Dvcfile from dvc.stage import Stage, create_stage, restore_meta expanded_targets = glob_targets(targets, glob=glob) @@ -259,12 +261,9 @@ def _create_stages( external=external, ) restore_meta(stage) - Dvcfile(repo, stage.path).remove() if desc: stage.outs[0].desc = desc - repo._reset() # pylint: disable=protected-access - if not stage: if pbar is not None: pbar.total -= 1 From c4b5c5526ff6817e7b57f2cbcb9d561b6f117637 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 29 Jun 2021 12:06:15 +0545 Subject: [PATCH 2/3] add tests --- tests/func/test_add.py | 43 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tests/func/test_add.py b/tests/func/test_add.py index b5559ddc3f..9de25e3c55 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -24,7 +24,11 @@ from dvc.hash_info import HashInfo from dvc.main import main from dvc.objects.db import ODBManager -from dvc.output import OutputAlreadyTrackedError, OutputIsStageFileError +from dvc.output import ( + OutputAlreadyTrackedError, + OutputDoesNotExistError, + OutputIsStageFileError, +) from dvc.stage import Stage from dvc.stage.exceptions import ( StageExternalOutputsError, @@ -1190,3 +1194,40 @@ def test_add_ignored(tmp_dir, scm, dvc): assert str(exc.value) == ("bad DVC file name '{}' is git-ignored.").format( os.path.join("dir", "subdir.dvc") ) + + +def test_add_on_not_existing_file_should_not_remove_stage_file(tmp_dir, dvc): + (stage,) = tmp_dir.dvc_gen("foo", "foo") + (tmp_dir / "foo").unlink() + dvcfile_contents = (tmp_dir / stage.path).read_text() + + with pytest.raises(OutputDoesNotExistError): + dvc.add("foo") + assert (tmp_dir / "foo.dvc").exists() + assert (tmp_dir / stage.path).read_text() == dvcfile_contents + + +@pytest.mark.parametrize( + "target", + [ + "dvc.repo.Repo.check_modified_graph", + "dvc.stage.Stage.save", + "dvc.stage.Stage.commit", + ], +) +def test_add_does_not_remove_stage_file_on_failure( + tmp_dir, dvc, mocker, target +): + (stage,) = tmp_dir.dvc_gen("foo", "foo") + tmp_dir.gen("foo", "foobar") # update file + dvcfile_contents = (tmp_dir / stage.path).read_text() + + mocker.patch( + target, + side_effect=DvcException(f"raising error from mocked '{target}'"), + ) + + with pytest.raises(DvcException): + dvc.add("foo") + assert (tmp_dir / "foo.dvc").exists() + assert (tmp_dir / stage.path).read_text() == dvcfile_contents From 63e14f760a238380bccf708f4a70d97fde876aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 29 Jun 2021 12:07:46 +0545 Subject: [PATCH 3/3] make the test more specific --- tests/func/test_add.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/func/test_add.py b/tests/func/test_add.py index 9de25e3c55..071a2f7531 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -1222,12 +1222,14 @@ def test_add_does_not_remove_stage_file_on_failure( tmp_dir.gen("foo", "foobar") # update file dvcfile_contents = (tmp_dir / stage.path).read_text() + exc_msg = f"raising error from mocked '{target}'" mocker.patch( target, - side_effect=DvcException(f"raising error from mocked '{target}'"), + side_effect=DvcException(exc_msg), ) - with pytest.raises(DvcException): + with pytest.raises(DvcException) as exc_info: dvc.add("foo") + assert str(exc_info.value) == exc_msg assert (tmp_dir / "foo.dvc").exists() assert (tmp_dir / stage.path).read_text() == dvcfile_contents