Skip to content

Commit

Permalink
dvc: compute dir hash without external cache (iterative#4528)
Browse files Browse the repository at this point in the history
  • Loading branch information
efiop authored Sep 5, 2020
1 parent 79e8e4e commit 636a019
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 23 deletions.
9 changes: 5 additions & 4 deletions dvc/cache/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,13 +280,14 @@ def _get_dir_info_hash(self, dir_info):
         return hash_info, to_info

     def save_dir_info(self, dir_info, hash_info=None):
-        if hash_info and not self.changed_cache_file(hash_info):
+        if (
+            hash_info
+            and hash_info.name == self.tree.PARAM_CHECKSUM
+            and not self.changed_cache_file(hash_info)
+        ):
             return hash_info

         hi, tmp_info = self._get_dir_info_hash(dir_info)
-        if hash_info:
-            assert hi == hash_info
-
         new_info = self.tree.hash_to_path_info(hi.value)
         if self.changed_cache_file(hi):
             self.tree.makedirs(new_info.parent)
Expand Down
11 changes: 2 additions & 9 deletions dvc/tree/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,7 @@

from funcy import cached_property

-from dvc.exceptions import (
-    DvcException,
-    DvcIgnoreInCollectedDirError,
-    RemoteCacheRequiredError,
-)
+from dvc.exceptions import DvcException, DvcIgnoreInCollectedDirError
from dvc.hash_info import HashInfo
from dvc.ignore import DvcIgnore
from dvc.path_info import URLInfo
Expand Down Expand Up @@ -331,11 +327,8 @@ def _collect_dir(self, path_info, **kwargs):
]

     def get_dir_hash(self, path_info, **kwargs):
-        if not self.cache:
-            raise RemoteCacheRequiredError(path_info)
-
         dir_info = self._collect_dir(path_info, **kwargs)
-        return self.cache.save_dir_info(dir_info)
+        return self.repo.cache.local.save_dir_info(dir_info)

def upload(self, from_info, to_info, name=None, no_progress_bar=False):
if not hasattr(self, "_upload"):
Expand Down
46 changes: 39 additions & 7 deletions tests/func/test_import_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pytest

+from dvc.cache import Cache
from dvc.dependency.base import DependencyDoesNotExistError
from dvc.main import main
from dvc.stage import Stage
Expand Down Expand Up @@ -141,28 +142,59 @@ def test_import_url(tmp_dir, dvc, workspace):


 @pytest.mark.parametrize(
-    "workspace",
+    "workspace, stage_md5, dir_md5",
     [
-        pytest.lazy_fixture("local_cloud"),
-        pytest.lazy_fixture("s3"),
-        pytest.lazy_fixture("gs"),
-        pytest.lazy_fixture("hdfs"),
+        (
+            pytest.lazy_fixture("local_cloud"),
+            "dc24e1271084ee317ac3c2656fb8812b",
+            "b6dcab6ccd17ca0a8bf4a215a37d14cc.dir",
+        ),
+        (
+            pytest.lazy_fixture("s3"),
+            "2aa17f8daa26996b3f7a4cf8888ac9ac",
+            "ec602a6ba97b2dd07bd6d2cd89674a60.dir",
+        ),
+        (pytest.lazy_fixture("gs"), "fixme", "fixme",),
+        (
+            pytest.lazy_fixture("hdfs"),
+            "ec0943f83357f702033c98e70b853c8c",
+            "e6dcd267966dc628d732874f94ef4280.dir",
+        ),
         pytest.param(
             pytest.lazy_fixture("ssh"),
+            "dc24e1271084ee317ac3c2656fb8812b",
+            "b6dcab6ccd17ca0a8bf4a215a37d14cc.dir",
             marks=pytest.mark.skipif(
                 os.name == "nt", reason="disabled on windows"
             ),
         ),
     ],
-    indirect=True,
+    indirect=["workspace"],
 )
-def test_import_url_dir(tmp_dir, dvc, workspace):
+def test_import_url_dir(tmp_dir, dvc, workspace, stage_md5, dir_md5):
workspace.gen({"dir": {"file": "file", "subdir": {"subfile": "subfile"}}})

# remove external cache to make sure that we don't need it to import dirs
with dvc.config.edit() as conf:
del conf["cache"]
dvc.cache = Cache(dvc)

assert not (tmp_dir / "dir").exists() # sanity check
dvc.imp_url("remote://workspace/dir")
assert set(os.listdir(tmp_dir / "dir")) == {"file", "subdir"}
assert (tmp_dir / "dir" / "file").read_text() == "file"
assert list(os.listdir(tmp_dir / "dir" / "subdir")) == ["subfile"]
assert (tmp_dir / "dir" / "subdir" / "subfile").read_text() == "subfile"

assert (tmp_dir / "dir.dvc").read_text() == (
f"md5: {stage_md5}\n"
"frozen: true\n"
"deps:\n"
f"- md5: {dir_md5}\n"
" path: remote://workspace/dir\n"
"outs:\n"
"- md5: b6dcab6ccd17ca0a8bf4a215a37d14cc.dir\n"
" path: dir\n"
)

assert dvc.status() == {}
6 changes: 3 additions & 3 deletions tests/func/test_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@
from mock import patch

from dvc.config import Config
-from dvc.exceptions import DownloadError, UploadError
+from dvc.exceptions import DownloadError, RemoteCacheRequiredError, UploadError
from dvc.main import main
from dvc.path_info import PathInfo
-from dvc.tree.base import BaseTree, RemoteCacheRequiredError
+from dvc.tree.base import BaseTree
from dvc.tree.local import LocalTree
from dvc.utils.fs import remove
from tests.basic_env import TestDvc
Expand Down Expand Up @@ -262,7 +262,7 @@ def test_external_dir_resource_on_no_cache(tmp_dir, dvc, tmp_path_factory):
with pytest.raises(RemoteCacheRequiredError):
dvc.run(
cmd="echo hello world",
-            deps=[os.fspath(external_dir)],
+            outs=[os.fspath(external_dir)],
single_stage=True,
)

Expand Down

0 comments on commit 636a019

Please sign in to comment.