Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

output: Don't serialize hash_info or meta in cloud versioning. #8849

Merged
merged 1 commit into from
Jan 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 24 additions & 16 deletions dvc/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,14 @@ def __init__(
name=self.hash_name,
value=getattr(self.meta, self.hash_name, None),
)
if self.meta.nfiles or self.hash_info and self.hash_info.isdir:
if self.files:
tree = Tree.from_list(self.files, hash_name=self.hash_name)
tree.digest()
self.hash_info = tree.hash_info
Comment on lines +372 to +374
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This introduces overhead but need to check how big it is

self.meta.isdir = True
self.meta.nfiles = len(self.files)
self.meta.size = sum(file.get("size") for file in self.files)
elif self.meta.nfiles or self.hash_info and self.hash_info.isdir:
self.meta.isdir = True
if not self.hash_info and self.hash_name != "md5":
md5 = getattr(self.meta, "md5", None)
Expand Down Expand Up @@ -747,7 +754,22 @@ def _commit_granular_dir(self, filter_info) -> Optional["HashFile"]:
def dumpd(self, **kwargs): # noqa: C901
meta = self.meta.to_dict()
meta.pop("isdir", None)
ret: Dict[str, Any] = {**self.hash_info.to_dict(), **meta}
ret: Dict[str, Any] = {}
if (
(not self.IS_DEPENDENCY or self.stage.is_import)
and self.hash_info.isdir
and (kwargs.get("with_files") or self.files is not None)
):
Comment on lines +758 to +762
Copy link
Member

@skshetry skshetry Jan 25, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason to move this upward @daavoo? This might affect the ordering of metadata in .dvc file. We probably want files to appear at the end.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No real reason. Could be moved below

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Created #8879.

obj: Optional["HashFile"]
if self.obj:
obj = self.obj
else:
obj = self.get_obj()
if obj:
obj = cast(Tree, obj)
ret[self.PARAM_FILES] = obj.as_list(with_meta=True)
else:
ret = {**self.hash_info.to_dict(), **meta}

if self.is_in_repo:
path = self.fs.path.as_posix(
Expand Down Expand Up @@ -788,20 +810,6 @@ def dumpd(self, **kwargs): # noqa: C901
if not self.can_push:
ret[self.PARAM_PUSH] = self.can_push

if (
(not self.IS_DEPENDENCY or self.stage.is_import)
and self.hash_info.isdir
and (kwargs.get("with_files") or self.files is not None)
):
obj: Optional["HashFile"]
if self.obj:
obj = self.obj
else:
obj = self.get_obj()
if obj:
obj = cast(Tree, obj)
ret[self.PARAM_FILES] = obj.as_list(with_meta=True)

return ret

def verify_metric(self):
Expand Down
10 changes: 4 additions & 6 deletions dvc/stage/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,18 +158,16 @@ def to_single_stage_lockfile(stage: "Stage", **kwargs) -> dict:
def _dumpd(item):
meta_d = item.meta.to_dict()
meta_d.pop("isdir", None)
ret = [
(item.PARAM_PATH, item.def_path),
*item.hash_info.to_dict().items(),
*meta_d.items(),
]

if item.hash_info.isdir and kwargs.get("with_files"):
if item.obj:
obj = item.obj
else:
obj = item.get_obj()
ret.append((item.PARAM_FILES, obj.as_list(with_meta=True)))
ret = [((item.PARAM_FILES, obj.as_list(with_meta=True)))]
else:
ret = [*item.hash_info.to_dict().items(), *meta_d.items()]
ret.insert(0, (item.PARAM_PATH, item.def_path))

return OrderedDict(ret)

Expand Down
58 changes: 58 additions & 0 deletions tests/func/test_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -1142,3 +1142,61 @@ def test_add_with_annotations(M, tmp_dir, dvc):
(stage,) = dvc.add("foo", type="t2")
assert stage.outs[0].annot == Annotation(**annot)
assert (tmp_dir / "foo.dvc").parse() == M.dict(outs=[M.dict(**annot)])


def test_add_updates_to_cloud_versioning_dir(tmp_dir, dvc):
    """`dvc add` on a cloud-versioned directory should refresh the entry of a
    locally modified file while leaving the unchanged file's entry intact."""
    bar_entry = {
        "size": 3,
        "version_id": "WYRG4BglP7pD.gEoJP6a4AqOhl.FRA.h",
        "etag": "acbd18db4cc2f85cedef654fccc4a4d8",
        "md5": "acbd18db4cc2f85cedef654fccc4a4d8",
        "relpath": "bar",
    }
    foo_entry = {
        "size": 3,
        "version_id": "0vL53tFVY5vVAoJ4HG2jCS1mEcohDPE0",
        "etag": "acbd18db4cc2f85cedef654fccc4a4d8",
        "md5": "acbd18db4cc2f85cedef654fccc4a4d8",
        "relpath": "foo",
    }
    (tmp_dir / "data.dvc").dump(
        {"outs": [{"path": "data", "files": [bar_entry, foo_entry]}]}
    )

    data_dir = tmp_dir / "data"
    data_dir.mkdir()
    # "foo" matches the recorded md5; "bar" has different content on disk.
    (data_dir / "foo").write_text("foo")
    (data_dir / "bar").write_text("bar2")

    dvc.add("data")

    # After `add`, the modified "bar" gets a fresh md5/size and drops its
    # stale version_id/etag; untouched "foo" keeps its original entry.
    updated_bar_entry = {
        "size": 4,
        "md5": "224e2539f52203eb33728acd228b4432",
        "relpath": "bar",
    }
    assert (tmp_dir / "data.dvc").parse() == {
        "outs": [{"path": "data", "files": [updated_bar_entry, foo_entry]}]
    }
55 changes: 55 additions & 0 deletions tests/unit/output/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,58 @@ def test_remote_missing_dependency_on_dir_pull(tmp_dir, scm, dvc, mocker):
)
with pytest.raises(RemoteMissingDepsError):
dvc.pull()


def test_hash_info_cloud_versioning_dir(mocker):
    """An Output constructed from a `files` list derives its directory-level
    `hash_info` and `meta` from those entries."""
    stage = mocker.MagicMock()
    stage.repo.fs.version_aware = False
    stage.repo.fs.PARAM_CHECKSUM = "etag"

    def entry(version_id, relpath):
        # Both files share the same content hash; only version/relpath differ.
        return {
            "size": 3,
            "version_id": version_id,
            "etag": "acbd18db4cc2f85cedef654fccc4a4d8",
            "md5": "acbd18db4cc2f85cedef654fccc4a4d8",
            "relpath": relpath,
        }

    files = [
        entry("WYRG4BglP7pD.gEoJP6a4AqOhl.FRA.h", "bar"),
        entry("0vL53tFVY5vVAoJ4HG2jCS1mEcohDPE0", "foo"),
    ]
    out = Output(stage, "path", files=files)

    # `hash_info` and `meta` are reconstructed from `files`.
    assert out.hash_info.name == "md5"
    assert out.hash_info.value == "77e8000f532886eef8ee1feba82e9bad.dir"
    assert out.meta.isdir
    assert out.meta.nfiles == 2
    assert out.meta.size == 6


def test_dumpd_cloud_versioning_dir(mocker):
    """`dumpd` for a cloud-versioned dir serializes only `path` and `files` —
    no top-level hash_info or meta."""
    stage = mocker.MagicMock()
    stage.repo.fs.version_aware = False
    stage.repo.fs.PARAM_CHECKSUM = "md5"

    def entry(version_id, relpath):
        # Both files share the same content hash; only version/relpath differ.
        return {
            "size": 3,
            "version_id": version_id,
            "etag": "acbd18db4cc2f85cedef654fccc4a4d8",
            "md5": "acbd18db4cc2f85cedef654fccc4a4d8",
            "relpath": relpath,
        }

    files = [
        entry("WYRG4BglP7pD.gEoJP6a4AqOhl.FRA.h", "bar"),
        entry("0vL53tFVY5vVAoJ4HG2jCS1mEcohDPE0", "foo"),
    ]
    out = Output(stage, "path", files=files)

    assert out.dumpd() == {"path": "path", "files": files}
25 changes: 25 additions & 0 deletions tests/unit/stage/test_serialize_pipeline_lock.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,3 +255,28 @@ def test_to_lockfile(dvc):
]
)
}


def test_to_single_stage_lockfile_cloud_versioning_dir(dvc):
    """Lockfile serialization of a cloud-versioned dir output emits only
    `path` and `files` (no hash/meta fields) when `with_files` is set."""
    stage = create_stage(PipelineStage, dvc, outs=["dir"], **kwargs)
    out = stage.outs[0]
    out.hash_info = HashInfo("md5", "md-five.dir")

    def entry(version_id, relpath):
        # Both files share the same content hash; only version/relpath differ.
        return {
            "size": 3,
            "version_id": version_id,
            "etag": "acbd18db4cc2f85cedef654fccc4a4d8",
            "md5": "acbd18db4cc2f85cedef654fccc4a4d8",
            "relpath": relpath,
        }

    files = [
        entry("WYRG4BglP7pD.gEoJP6a4AqOhl.FRA.h", "bar"),
        entry("0vL53tFVY5vVAoJ4HG2jCS1mEcohDPE0", "foo"),
    ]
    out.files = files

    e = _to_single_stage_lockfile(stage, with_files=True)
    # NOTE(review): `Schema(e)` constructs a schema object, which is always
    # truthy — confirm this was meant to *validate* `e` rather than wrap it.
    assert Schema(e)
    assert e["outs"][0] == {"path": "dir", "files": files}