Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

repro: tests: migrate to dir helpers #3091

Merged
merged 4 commits into from
Jan 10, 2020
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
268 changes: 129 additions & 139 deletions tests/func/test_repro.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
import getpass
import os
import posixpath
import pathlib
import re
import shutil
import uuid
from pathlib import Path
from subprocess import PIPE
from subprocess import Popen
from urllib.parse import urljoin
Expand All @@ -17,6 +17,7 @@
from google.cloud import storage as gc
from mock import patch

from dvc.compat import fspath
from dvc.exceptions import CyclicGraphError
from dvc.exceptions import ReproductionError
from dvc.exceptions import StagePathAsOutputError
Expand Down Expand Up @@ -1372,210 +1373,199 @@ def test(self):


@pytest.fixture
def repro_dir(dvc_repo, repo_dir):
repo_dir.dname = "dir"
os.mkdir(repo_dir.dname)
repo_dir.emptydname = "emptydir"
os.mkdir(repo_dir.emptydname)
subdname = os.path.join(repo_dir.dname, "subdir")
os.mkdir(subdname)

repo_dir.source = "source"
repo_dir.source_stage = repo_dir.source + ".dvc"
stage = dvc_repo.run(
fname=repo_dir.source_stage,
outs=[repo_dir.source],
deps=[repo_dir.FOO],
)
assert stage is not None
assert filecmp.cmp(repo_dir.source, repo_dir.FOO, shallow=False)

repo_dir.unrelated1 = "unrelated1"
repo_dir.unrelated1_stage = repo_dir.unrelated1 + ".dvc"
stage = dvc_repo.run(
fname=repo_dir.unrelated1_stage,
outs=[repo_dir.unrelated1],
deps=[repo_dir.source],
)
assert stage is not None

repo_dir.unrelated2 = "unrelated2"
repo_dir.unrelated2_stage = repo_dir.unrelated2 + ".dvc"
stage = dvc_repo.run(
fname=repo_dir.unrelated2_stage,
outs=[repo_dir.unrelated2],
deps=[repo_dir.DATA],
def repro_dir(tmp_dir, dvc, run_copy):
# Creates repo with following structure:
# data_dir/dir_file origin_data
# | | |
# | | origin_copy.dvc
# unrelated2.dvc | | |
# | | unrelated1.dvc
# dir/subdir/dir_file_copy.dvc |
# | |
# | dir/origin_copy_2.dvc
# | |
# \ /
# \ /
# dir/Dvcfile
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could've used tree CLI util, it would be much easier to understand than graph πŸ™‚

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I intended to show dependency between particular stages, tree would show placement in the directory. I thought that this one brings more information. Maybe I should include both?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, right, sorry. Ok, why not dvc pipeline show --ascii then? Or is it too big?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

^ Just wondering, I'm okay with this illustration.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@efiop, you are right, it was too big, had to to it my own way, so that we will still be able to pass all checks :)

tmp_dir.gen(
{
"origin_data": "origin data content",
"data_dir": {"dir_file": "dir file content"},
"dir": {"subdir": {}},
}
)
assert stage is not None

repo_dir.first = os.path.join(repo_dir.dname, "first")
repo_dir.first_stage = repo_dir.first + ".dvc"

stage = dvc_repo.run(
fname=repo_dir.first_stage,
deps=[repo_dir.source],
outs=[repo_dir.first],
cmd="python {} {} {}".format(
repo_dir.CODE, repo_dir.source, repo_dir.first
),
tmp_dir.origin_copy = "origin_copy"
assert run_copy("origin_data", tmp_dir.origin_copy) is not None
assert (tmp_dir / tmp_dir.origin_copy).read_text() == "origin data content"

# Unrelated are to verify that reproducing `dir` will not trigger them too
assert run_copy(tmp_dir.origin_copy, "unrelated1") is not None
dir_file_path = tmp_dir / "data_dir" / "dir_file"
assert run_copy(fspath(dir_file_path), "unrelated2") is not None

tmp_dir.origin_copy_2 = os.path.join("dir", "origin_copy_2")
assert (
run_copy(
tmp_dir.origin_copy,
tmp_dir.origin_copy_2,
fname=tmp_dir.origin_copy_2 + ".dvc",
)
is not None
)
assert stage is not None
assert filecmp.cmp(repo_dir.first, repo_dir.FOO, shallow=False)

repo_dir.second = os.path.join(subdname, "second")
repo_dir.second_stage = repo_dir.second + ".dvc"
stage = dvc_repo.run(
fname=repo_dir.second_stage,
outs=[repo_dir.second],
deps=[repo_dir.DATA],
assert (
tmp_dir / tmp_dir.origin_copy_2
).read_text() == "origin data content"

tmp_dir.dir_file_copy = os.path.join("dir", "subdir", "dir_file_copy")
assert (
run_copy(
fspath(dir_file_path),
tmp_dir.dir_file_copy,
fname=tmp_dir.dir_file_copy + ".dvc",
)
is not None
)
assert stage is not None
assert filecmp.cmp(repo_dir.second, repo_dir.DATA, shallow=False)
assert (tmp_dir / tmp_dir.dir_file_copy).read_text() == "dir file content"

repo_dir.third_stage = os.path.join(repo_dir.dname, "Dvcfile")
stage = dvc_repo.run(
fname=repo_dir.third_stage, deps=[repo_dir.first, repo_dir.second]
tmp_dir.last_stage = os.path.join("dir", "Dvcfile")
assert (
dvc.run(
fname=tmp_dir.last_stage,
deps=[tmp_dir.origin_copy_2, tmp_dir.dir_file_copy],
)
is not None
)
assert stage is not None

yield repo_dir
yield tmp_dir


def test_recursive_repro_default(dvc_repo, repro_dir):
def test_recursive_repro_default(dvc, repro_dir):
"""
Test recursive repro on dir after a dep outside this dir has changed.
"""
os.unlink(repro_dir.FOO)
shutil.copyfile(repro_dir.BAR, repro_dir.FOO)
origin = Path("origin_data")
origin.unlink()
origin.write_text("new origin data content")

stages = dvc.reproduce("dir", recursive=True)

stages = dvc_repo.reproduce(repro_dir.dname, recursive=True)
# Check that the dependency ("source") and the dependent stages
# inside the folder have been reproduced ("first", "third")
assert len(stages) == 3
names = [stage.relpath for stage in stages]
assert repro_dir.source_stage in names
assert repro_dir.first_stage in names
assert repro_dir.third_stage in names
assert filecmp.cmp(repro_dir.source, repro_dir.BAR, shallow=False)
assert filecmp.cmp(repro_dir.first, repro_dir.BAR, shallow=False)
names = [s.relpath for s in stages]
assert len(names) == 3
assert set(names) == {
repro_dir.origin_copy + ".dvc",
repro_dir.origin_copy_2 + ".dvc",
repro_dir.last_stage,
}
assert Path(repro_dir.origin_copy).read_text() == "new origin data content"
assert (
Path(repro_dir.origin_copy_2).read_text() == "new origin data content"
)


def test_recursive_repro_single(dvc_repo, repro_dir):
def test_recursive_repro_single(dvc, repro_dir):
"""
Test recursive single-item repro on dir
after a dep outside this dir has changed.
"""
os.unlink(repro_dir.FOO)
shutil.copyfile(repro_dir.BAR, repro_dir.FOO)
origin_data = Path("origin_data")
origin_data.unlink()
origin_data.write_text("new origin content")

os.unlink(repro_dir.DATA)
shutil.copyfile(repro_dir.BAR, repro_dir.DATA)
dir_file = repro_dir / "data_dir" / "dir_file"
dir_file.unlink()
dir_file.write_text("new dir file content")

stages = dvc_repo.reproduce(
repro_dir.dname, recursive=True, single_item=True
)
stages = dvc.reproduce("dir", recursive=True, single_item=True)
# Check that just stages inside given dir
# with changed direct deps have been reproduced.
# This means that "first" stage should not be reproduced
# since it depends on "source".
# Also check that "second" stage was reproduced before "third" stage
# Also check that "dir_file_copy" stage was reproduced before "third" stage
assert len(stages) == 2
assert repro_dir.second_stage == stages[0].relpath
assert repro_dir.third_stage == stages[1].relpath
assert filecmp.cmp(repro_dir.second, repro_dir.BAR, shallow=False)
assert [s.relpath for s in stages] == [
repro_dir.dir_file_copy + ".dvc",
repro_dir.last_stage,
]
assert Path(repro_dir.dir_file_copy).read_text() == "new dir file content"


def test_recursive_repro_single_force(dvc_repo, repro_dir):
def test_recursive_repro_single_force(dvc, repro_dir):
"""
Test recursive single-item force repro on dir
without any dependencies changing.
"""
stages = dvc_repo.reproduce(
repro_dir.dname, recursive=True, single_item=True, force=True
)
stages = dvc.reproduce("dir", recursive=True, single_item=True, force=True)
assert len(stages) == 3
names = [stage.relpath for stage in stages]
# Check that all stages inside given dir have been reproduced
# Also check that "second" stage was reproduced before "third" stage
# Also check that "dir_file_copy" stage was reproduced before "third" stage
# and that "first" stage was reproduced before "third" stage
assert repro_dir.first_stage in names
assert repro_dir.second_stage in names
assert repro_dir.third_stage in names
assert names.index(repro_dir.first_stage) < names.index(
repro_dir.third_stage
names = [s.relpath for s in stages]
assert {
repro_dir.origin_copy_2 + ".dvc",
repro_dir.dir_file_copy + ".dvc",
repro_dir.last_stage,
} == set(names)
assert names.index(repro_dir.origin_copy_2 + ".dvc") < names.index(
repro_dir.last_stage
)
assert names.index(repro_dir.second_stage) < names.index(
repro_dir.third_stage
assert names.index(repro_dir.dir_file_copy + ".dvc") < names.index(
repro_dir.last_stage
)


def test_recursive_repro_empty_dir(dvc_repo, repro_dir):
def test_recursive_repro_empty_dir(dvc, repro_dir):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I presume that repro_dir was added here to make sure running repro, will not trigger any other stage?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pared I think it was only used for repro_dir.emtydname, so we can safely remove it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@efiop removed

"""
Test recursive repro on an empty directory
"""
stages = dvc_repo.reproduce(
repro_dir.emptydname, recursive=True, force=True
)
(repro_dir / "emptydir").mkdir()

stages = dvc.reproduce("emptydir", recursive=True, force=True)
assert len(stages) == 0


def test_recursive_repro_recursive_missing_file(dvc_repo):
def test_recursive_repro_recursive_missing_file(dvc):
"""
Test recursive repro on a missing file
"""
with pytest.raises(StageFileDoesNotExistError):
dvc_repo.reproduce("notExistingStage.dvc", recursive=True)
dvc.reproduce("notExistingStage.dvc", recursive=True)
with pytest.raises(StageFileDoesNotExistError):
dvc_repo.reproduce("notExistingDir/", recursive=True)
dvc.reproduce("notExistingDir/", recursive=True)


def test_recursive_repro_on_stage_file(dvc_repo, repro_dir):
def test_recursive_repro_on_stage_file(dvc, repro_dir):
"""
Test recursive repro on a stage file instead of directory
"""
stages = dvc_repo.reproduce(
repro_dir.first_stage, recursive=True, force=True
stages = dvc.reproduce(
repro_dir.origin_copy_2 + ".dvc", recursive=True, force=True
)
assert len(stages) == 2
names = [stage.relpath for stage in stages]
assert repro_dir.source_stage in names
assert repro_dir.first_stage in names


@pytest.fixture
def foo_copy(repo_dir, dvc_repo):
stages = dvc_repo.add(repo_dir.FOO)
assert len(stages) == 1
foo_stage = stages[0]
assert foo_stage is not None

fname = "foo_copy"
stage_fname = fname + ".dvc"
dvc_repo.run(
fname=stage_fname,
outs=[fname],
deps=[repo_dir.FOO, repo_dir.CODE],
cmd="python {} {} {}".format(repo_dir.CODE, repo_dir.FOO, fname),
)
return {"fname": fname, "stage_fname": stage_fname}
assert [
repro_dir.origin_copy + ".dvc",
repro_dir.origin_copy_2 + ".dvc",
] == [stage.relpath for stage in stages]


def test_dvc_formatting_retained(dvc_repo, foo_copy):
root = pathlib.Path(dvc_repo.root_dir)
stage_file = root / foo_copy["stage_fname"]
def test_dvc_formatting_retained(tmp_dir, dvc, run_copy):
tmp_dir.dvc_gen("foo", "foo content")
stage = run_copy("foo", "foo_copy", fname="foo_copy.dvc")
stage_path = tmp_dir / stage.relpath

# Add comments and custom formatting to DVC-file
lines = list(map(_format_dvc_line, stage_file.read_text().splitlines()))
lines = list(map(_format_dvc_line, stage_path.read_text().splitlines()))
lines.insert(0, "# Starting comment")
stage_text = "".join(l + "\n" for l in lines)
stage_file.write_text(stage_text)
stage_path.write_text(stage_text)

# Rewrite data source and repro
(root / "foo").write_text("new_foo")
dvc_repo.reproduce(foo_copy["stage_fname"])
(tmp_dir / "foo").write_text("new foo")
dvc.reproduce("foo_copy.dvc", force=True)

# All differences should be only about md5
assert _hide_md5(stage_text) == _hide_md5(stage_file.read_text())
assert _hide_md5(stage_text) == _hide_md5(stage_path.read_text())


def _format_dvc_line(line):
Expand Down Expand Up @@ -1630,7 +1620,7 @@ def test(self):
assert evaluation[2].relpath == "E.dvc"


def test_ssh_dir_out(dvc_repo):
def test_ssh_dir_out(dvc):
if not _should_test_ssh():
pytest.skip()

Expand All @@ -1643,7 +1633,7 @@ def test_ssh_dir_out(dvc_repo):
assert main(["config", "cache.ssh", "sshcache"]) == 0

# Recreating to reread configs
repo = DvcRepo(dvc_repo.root_dir)
repo = DvcRepo(dvc.root_dir)

url_info = URLInfo(remote_url)
mkdir_cmd = "mkdir dir-out;cd dir-out;echo 1 > 1.txt; echo 2 > 2.txt"
Expand Down