Skip to content

Commit

Permalink
stage collection: do not collect .gitignored dvcfiles (#5265)
Browse files Browse the repository at this point in the history
* Do not collect .gitignored dvcfiles

* fix exc

* consistent error message

* fix imports

* add tests
  • Loading branch information
skshetry authored Jan 26, 2021
1 parent a5e19ec commit b1d6918
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 11 deletions.
20 changes: 18 additions & 2 deletions dvc/dvcfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@
PIPELINE_LOCK = "dvc.lock"


class FileIsGitIgnored(DvcException):
def __init__(self, path):
super().__init__(f"'{path}' is git-ignored.")


class LockfileCorruptedError(DvcException):
pass

Expand Down Expand Up @@ -103,18 +108,29 @@ def relpath(self):
def exists(self):
return self.repo.tree.exists(self.path)

def _is_git_ignored(self):
from dvc.tree.local import LocalTree

return isinstance(
self.repo.tree, LocalTree
) and self.repo.scm.is_ignored(self.path)

def _load(self):
# it raises the proper exceptions by priority:
# 1. when the file doesn't exists
# 2. filename is not a DVC-file
# 3. path doesn't represent a regular file
# 4. when the file is git ignored
if not self.exists():
raise StageFileDoesNotExistError(self.path)
is_ignored = self.repo.tree.exists(self.path, use_dvcignore=False)
raise StageFileDoesNotExistError(self.path, dvc_ignored=is_ignored)

if self.verify:
check_dvc_filename(self.path)
if not self.repo.tree.isfile(self.path):
raise StageFileIsNotDvcFileError(self.path)

if self._is_git_ignored():
raise FileIsGitIgnored(self.path)
with self.repo.tree.open(self.path) as fd:
stage_text = fd.read()
d = parse_yaml(stage_text, self.path)
Expand Down
29 changes: 25 additions & 4 deletions dvc/repo/stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import typing
from contextlib import suppress
from functools import partial
from typing import (
Callable,
Iterable,
Expand Down Expand Up @@ -371,13 +372,33 @@ def collect_repo(self, onerror: Callable[[str, Exception], None] = None):
the collection.
"""
from dvc.dvcfile import is_valid_filename
from dvc.tree.local import LocalTree

stages = []
scm = self.repo.scm
sep = os.sep
outs: Set[str] = set()

is_local_tree = isinstance(self.tree, LocalTree)

def is_ignored(path):
# apply only for the local tree
return is_local_tree and scm.is_ignored(path)

def is_dvcfile_and_not_ignored(root, file):
return is_valid_filename(file) and not is_ignored(
f"{root}{sep}{file}"
)

def is_out_or_ignored(root, directory):
dir_path = f"{root}{sep}{directory}"
return dir_path in outs or is_ignored(dir_path)

stages = []
for root, dirs, files in self.tree.walk(self.repo.root_dir):
for file_name in filter(is_valid_filename, files):
file_path = os.path.join(root, file_name)
dvcfile_filter = partial(is_dvcfile_and_not_ignored, root)

for file in filter(dvcfile_filter, files):
file_path = os.path.join(root, file)
try:
new_stages = self.load_file(file_path)
except DvcException as exc:
Expand All @@ -393,5 +414,5 @@ def collect_repo(self, onerror: Callable[[str, Exception], None] = None):
for out in stage.outs
if out.scheme == "local"
)
dirs[:] = [d for d in dirs if os.path.join(root, d) not in outs]
dirs[:] = [d for d in dirs if not is_out_or_ignored(root, d)]
return stages
8 changes: 6 additions & 2 deletions dvc/stage/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,13 @@ class StageFileFormatError(DvcException):


class StageFileDoesNotExistError(DvcException):
def __init__(self, fname):
DVC_IGNORED = "is dvc-ignored"
DOES_NOT_EXIST = "does not exist"

def __init__(self, fname, dvc_ignored=False):
self.file = fname
super().__init__(f"'{self.file}' does not exist.")
message = self.DVC_IGNORED if dvc_ignored else self.DOES_NOT_EXIST
super().__init__(f"'{self.file}' {message}")


class StageFileAlreadyExistsError(DvcException):
Expand Down
44 changes: 43 additions & 1 deletion tests/func/test_stage_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest
from funcy import raiser

from dvc.dvcfile import PIPELINE_FILE
from dvc.dvcfile import PIPELINE_FILE, FileIsGitIgnored
from dvc.exceptions import NoOutputOrStageError
from dvc.path_info import PathInfo
from dvc.repo import Repo
Expand Down Expand Up @@ -451,3 +451,45 @@ def test_collect_repo_callback(tmp_dir, dvc, mocker):
file_path, exc = mock.call_args[0]
assert file_path == PIPELINE_FILE
assert isinstance(exc, StageFileFormatError)


def test_gitignored_collect_repo(tmp_dir, dvc, scm):
(stage,) = tmp_dir.dvc_gen({"data": {"foo": "foo", "bar": "bar"}})

assert dvc.stage.collect_repo() == [stage]

scm.ignore(stage.path)
scm._reset()

assert not dvc.stage.collect_repo()


def test_gitignored_file_try_collect_granular_for_data_files(
tmp_dir, dvc, scm
):
(stage,) = tmp_dir.dvc_gen({"data": {"foo": "foo", "bar": "bar"}})
path = PathInfo("data") / "foo"

assert dvc.stage.collect_granular(str(path)) == [
(stage, PathInfo(tmp_dir / path))
]

scm.ignore(stage.path)
dvc._reset()

with pytest.raises(NoOutputOrStageError):
dvc.stage.collect_granular(str(path))


def test_gitignored_file_try_collect_granular_for_dvc_yaml_files(
tmp_dir, dvc, scm, stages
):
assert dvc.stage.collect_granular("bar") == [
(stages["copy-foo-bar"], PathInfo(tmp_dir / "bar"))
]

scm.ignore(tmp_dir / "dvc.yaml")
scm._reset()

with pytest.raises(FileIsGitIgnored):
dvc.stage.collect_granular("bar")
44 changes: 42 additions & 2 deletions tests/unit/test_dvcfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
PIPELINE_FILE,
PIPELINE_LOCK,
Dvcfile,
FileIsGitIgnored,
PipelineFile,
SingleStageFile,
)
Expand Down Expand Up @@ -41,12 +42,23 @@ def test_pipelines_single_stage_file(path):


@pytest.mark.parametrize("file", ["stage.dvc", "dvc.yaml"])
def test_stage_load_on_not_existing_file(tmp_dir, dvc, file):
@pytest.mark.parametrize("is_dvcignored", [True, False])
def test_stage_load_on_not_existing_file(tmp_dir, dvc, file, is_dvcignored):
dvcfile = Dvcfile(dvc, file)
if is_dvcignored:
(tmp_dir / ".dvcignore").write_text(file)

assert not dvcfile.exists()
with pytest.raises(StageFileDoesNotExistError):
with pytest.raises(StageFileDoesNotExistError) as exc_info:
assert dvcfile.stages.values()

assert str(exc_info.value) == f"'{file}' does not exist"


@pytest.mark.parametrize("file", ["stage.dvc", "dvc.yaml"])
def test_stage_load_on_non_file(tmp_dir, dvc, file):
(tmp_dir / file).mkdir()
dvcfile = Dvcfile(dvc, file)
with pytest.raises(StageFileIsNotDvcFileError):
assert dvcfile.stages.values()

Expand Down Expand Up @@ -83,3 +95,31 @@ def test_dump_stage(tmp_dir, dvc):
assert (tmp_dir / PIPELINE_FILE).exists()
assert (tmp_dir / PIPELINE_LOCK).exists()
assert list(dvcfile.stages.values()) == [stage]


@pytest.mark.parametrize("file", ["stage.dvc", "dvc.yaml"])
def test_stage_load_file_exists_but_dvcignored(tmp_dir, dvc, scm, file):
(tmp_dir / file).write_text("")
(tmp_dir / ".dvcignore").write_text(file)

dvcfile = Dvcfile(dvc, file)
with pytest.raises(StageFileDoesNotExistError) as exc_info:
assert dvcfile.stages.values()

assert str(exc_info.value) == f"'{file}' is dvc-ignored"


@pytest.mark.parametrize("file", ["foo.dvc", "dvc.yaml"])
def test_try_loading_dvcfile_that_is_gitignored(tmp_dir, dvc, scm, file):
with open(tmp_dir / ".gitignore", "a+") as fd:
fd.write(file)

# create a file just to avoid other checks
(tmp_dir / file).write_text("")
scm._reset()

dvcfile = Dvcfile(dvc, file)
with pytest.raises(FileIsGitIgnored) as exc_info:
dvcfile._load()

assert str(exc_info.value) == f"'{file}' is git-ignored."

0 comments on commit b1d6918

Please sign in to comment.