From 291418f7e13c09a722c69f8c0a520b9324d13bf0 Mon Sep 17 00:00:00 2001 From: David de la Iglesia Castro Date: Fri, 20 Jan 2023 11:21:21 +0100 Subject: [PATCH] output: Don't serialize `hash_info` or `meta` in cloud versioning. Closes #8357 --- dvc/output.py | 37 +++++++----- dvc/stage/serialize.py | 10 ++-- tests/func/test_add.py | 58 +++++++++++++++++++ tests/unit/output/test_output.py | 52 +++++++++++++++++ .../stage/test_serialize_pipeline_lock.py | 25 ++++++++ 5 files changed, 160 insertions(+), 22 deletions(-) diff --git a/dvc/output.py b/dvc/output.py index d97a2e9bd1a..ec84dc0437f 100644 --- a/dvc/output.py +++ b/dvc/output.py @@ -368,7 +368,11 @@ def __init__( name=self.hash_name, value=getattr(self.meta, self.hash_name, None), ) - if self.meta.nfiles or self.hash_info and self.hash_info.isdir: + if self.files: + tree = Tree.from_list(self.files, hash_name=self.hash_name) + tree.digest() + self.hash_info = tree.hash_info + elif self.meta.nfiles or self.hash_info and self.hash_info.isdir: self.meta.isdir = True if not self.hash_info and self.hash_name != "md5": md5 = getattr(self.meta, "md5", None) @@ -747,7 +751,22 @@ def _commit_granular_dir(self, filter_info) -> Optional["HashFile"]: def dumpd(self, **kwargs): # noqa: C901 meta = self.meta.to_dict() meta.pop("isdir", None) - ret: Dict[str, Any] = {**self.hash_info.to_dict(), **meta} + ret: Dict[str, Any] = {} + if ( + (not self.IS_DEPENDENCY or self.stage.is_import) + and self.hash_info.isdir + and (kwargs.get("with_files") or self.files is not None) + ): + obj: Optional["HashFile"] + if self.obj: + obj = self.obj + else: + obj = self.get_obj() + if obj: + obj = cast(Tree, obj) + ret[self.PARAM_FILES] = obj.as_list(with_meta=True) + else: + ret = {**self.hash_info.to_dict(), **meta} if self.is_in_repo: path = self.fs.path.as_posix( @@ -788,20 +807,6 @@ def dumpd(self, **kwargs): # noqa: C901 if not self.can_push: ret[self.PARAM_PUSH] = self.can_push - if ( - (not self.IS_DEPENDENCY or self.stage.is_import) - and self.hash_info.isdir - and (kwargs.get("with_files") or self.files is not None) - ): - obj: Optional["HashFile"] - if self.obj: - obj = self.obj - else: - obj = self.get_obj() - if obj: - obj = cast(Tree, obj) - ret[self.PARAM_FILES] = obj.as_list(with_meta=True) - return ret def verify_metric(self): diff --git a/dvc/stage/serialize.py b/dvc/stage/serialize.py index 686b302d091..0ab5ad9883d 100644 --- a/dvc/stage/serialize.py +++ b/dvc/stage/serialize.py @@ -158,18 +158,16 @@ def to_single_stage_lockfile(stage: "Stage", **kwargs) -> dict: def _dumpd(item): meta_d = item.meta.to_dict() meta_d.pop("isdir", None) - ret = [ - (item.PARAM_PATH, item.def_path), - *item.hash_info.to_dict().items(), - *meta_d.items(), - ] if item.hash_info.isdir and kwargs.get("with_files"): if item.obj: obj = item.obj else: obj = item.get_obj() - ret.append((item.PARAM_FILES, obj.as_list(with_meta=True))) + ret = [((item.PARAM_FILES, obj.as_list(with_meta=True)))] + else: + ret = [*item.hash_info.to_dict().items(), *meta_d.items()] + ret.insert(0, (item.PARAM_PATH, item.def_path)) return OrderedDict(ret) diff --git a/tests/func/test_add.py b/tests/func/test_add.py index bfb8c405d9e..0299797541c 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -1142,3 +1142,61 @@ def test_add_with_annotations(M, tmp_dir, dvc): (stage,) = dvc.add("foo", type="t2") assert stage.outs[0].annot == Annotation(**annot) assert (tmp_dir / "foo.dvc").parse() == M.dict(outs=[M.dict(**annot)]) + + +def test_add_updates_to_cloud_versioning_dir(tmp_dir, dvc): + data_dvc = tmp_dir / "data.dvc" + data_dvc.dump( + { + "outs": [ + { + "path": "data", + "files": [ + { + "size": 3, + "version_id": "WYRG4BglP7pD.gEoJP6a4AqOhl.FRA.h", + "etag": "acbd18db4cc2f85cedef654fccc4a4d8", + "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + "relpath": "bar", + }, + { + "size": 3, + "version_id": "0vL53tFVY5vVAoJ4HG2jCS1mEcohDPE0", + "etag": "acbd18db4cc2f85cedef654fccc4a4d8", + "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + "relpath": "foo", + }, + ], + } + ] + } + ) + + data = tmp_dir / "data" + data.mkdir() + (data / "foo").write_text("foo") + (data / "bar").write_text("bar2") + + dvc.add("data") + + assert (tmp_dir / "data.dvc").parse() == { + "outs": [ + { + "path": "data", + "files": [ + { + "size": 4, + "md5": "224e2539f52203eb33728acd228b4432", + "relpath": "bar", + }, + { + "size": 3, + "version_id": "0vL53tFVY5vVAoJ4HG2jCS1mEcohDPE0", + "etag": "acbd18db4cc2f85cedef654fccc4a4d8", + "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + "relpath": "foo", + }, + ], + } + ] + } diff --git a/tests/unit/output/test_output.py b/tests/unit/output/test_output.py index 740605e6f2a..6ad0874493d 100644 --- a/tests/unit/output/test_output.py +++ b/tests/unit/output/test_output.py @@ -121,3 +121,55 @@ def test_remote_missing_dependency_on_dir_pull(tmp_dir, scm, dvc, mocker): ) with pytest.raises(RemoteMissingDepsError): dvc.pull() + + +def test_hash_info_cloud_versioning_dir(mocker): + stage = mocker.MagicMock() + stage.repo.fs.version_aware = False + stage.repo.fs.PARAM_CHECKSUM = "etag" + files = [ + { + "size": 3, + "version_id": "WYRG4BglP7pD.gEoJP6a4AqOhl.FRA.h", + "etag": "acbd18db4cc2f85cedef654fccc4a4d8", + "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + "relpath": "bar", + }, + { + "size": 3, + "version_id": "0vL53tFVY5vVAoJ4HG2jCS1mEcohDPE0", + "etag": "acbd18db4cc2f85cedef654fccc4a4d8", + "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + "relpath": "foo", + }, + ] + out = Output(stage, "path", files=files) + # hash_info constructed from files + assert out.hash_info.name == "md5" + assert out.hash_info.value == "77e8000f532886eef8ee1feba82e9bad.dir" + + +def test_dumpd_cloud_versioning_dir(mocker): + stage = mocker.MagicMock() + stage.repo.fs.version_aware = False + stage.repo.fs.PARAM_CHECKSUM = "md5" + files = [ + { + "size": 3, + "version_id": "WYRG4BglP7pD.gEoJP6a4AqOhl.FRA.h", + "etag": "acbd18db4cc2f85cedef654fccc4a4d8", + "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + "relpath": "bar", + }, + { + "size": 3, + "version_id": "0vL53tFVY5vVAoJ4HG2jCS1mEcohDPE0", + "etag": "acbd18db4cc2f85cedef654fccc4a4d8", + "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + "relpath": "foo", + }, + ] + out = Output(stage, "path", files=files) + + dumpd = out.dumpd() + assert dumpd == {"path": "path", "files": files} diff --git a/tests/unit/stage/test_serialize_pipeline_lock.py b/tests/unit/stage/test_serialize_pipeline_lock.py index 53905fe3708..342c89ec3a1 100644 --- a/tests/unit/stage/test_serialize_pipeline_lock.py +++ b/tests/unit/stage/test_serialize_pipeline_lock.py @@ -255,3 +255,28 @@ def test_to_lockfile(dvc): ] ) } + + +def test_to_single_stage_lockfile_cloud_versioning_dir(dvc): + stage = create_stage(PipelineStage, dvc, outs=["dir"], **kwargs) + stage.outs[0].hash_info = HashInfo("md5", "md-five.dir") + files = [ + { + "size": 3, + "version_id": "WYRG4BglP7pD.gEoJP6a4AqOhl.FRA.h", + "etag": "acbd18db4cc2f85cedef654fccc4a4d8", + "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + "relpath": "bar", + }, + { + "size": 3, + "version_id": "0vL53tFVY5vVAoJ4HG2jCS1mEcohDPE0", + "etag": "acbd18db4cc2f85cedef654fccc4a4d8", + "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + "relpath": "foo", + }, + ] + stage.outs[0].files = files + e = _to_single_stage_lockfile(stage, with_files=True) + assert Schema(e) + assert e["outs"][0] == {"path": "dir", "files": files}