From e1b82c5222930c55886ca16a48c3a223d05b4af0 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Wed, 4 Nov 2020 19:22:57 +0200 Subject: [PATCH] dvc: add size/nfiles for deps/outs (#4836) * dvc: add size for deps/outs Related to #3256 * dvc: add nfiles for deps/outs * dvc: put size/nfiles into the hash_info --- dvc/cache/base.py | 39 ++++++++++++++++++------- dvc/cache/local.py | 3 +- dvc/hash_info.py | 28 +++++++++++++++--- dvc/output/__init__.py | 3 ++ dvc/output/base.py | 11 +++++-- dvc/schema.py | 8 +++++- dvc/stage/cache.py | 6 +++- dvc/stage/serialize.py | 18 ++++++------ dvc/stage/utils.py | 5 ++++ dvc/state.py | 30 ++++++++++--------- dvc/tree/base.py | 48 +++++++++++++++++++------------ dvc/tree/hdfs.py | 9 ++++-- dvc/tree/local.py | 7 ++++- dvc/tree/s3.py | 6 +++- dvc/tree/ssh/__init__.py | 7 ++++- tests/func/test_add.py | 11 ++++++- tests/func/test_import_url.py | 4 +++ tests/func/test_install.py | 4 +++ tests/func/test_lockfile.py | 8 ++++-- tests/func/test_remote.py | 16 +++++------ tests/func/test_repro.py | 15 ++++++---- tests/func/test_run_multistage.py | 2 ++ tests/func/test_state.py | 20 ++++++------- 23 files changed, 214 insertions(+), 94 deletions(-) diff --git a/dvc/cache/base.py b/dvc/cache/base.py index 6f3240da02..7f9acab1b1 100644 --- a/dvc/cache/base.py +++ b/dvc/cache/base.py @@ -227,7 +227,7 @@ def _save_file(self, path_info, tree, hash_info, save_link=True, **kwargs): # we need to update path and cache, since in case of reflink, # or copy cache type moving original file results in updates on # next executed command, which causes md5 recalculation - self.tree.state.save(path_info, hash_info.value) + self.tree.state.save(path_info, hash_info) else: if self.changed_cache(hash_info): with tree.open(path_info, mode="rb") as fobj: @@ -241,7 +241,7 @@ def _save_file(self, path_info, tree, hash_info, save_link=True, **kwargs): if callback: callback(1) - self.tree.state.save(cache_info, hash_info.value) + self.tree.state.save(cache_info, hash_info) def _cache_is_copy(self, path_info): """Checks whether cache uses copies.""" @@ -287,6 +287,7 @@ def _get_dir_info_hash(self, dir_info): hash_info = self.tree.get_file_hash(to_info) hash_info.value += self.tree.CHECKSUM_DIR_SUFFIX hash_info.dir_info = self._to_dict(dir_info) + hash_info.nfiles = len(dir_info) return hash_info, to_info @@ -305,7 +306,7 @@ def save_dir_info(self, dir_info, hash_info=None): self.tree.makedirs(new_info.parent) self.tree.move(tmp_info, new_info, mode=self.CACHE_MODE) - self.tree.state.save(new_info, hi.value) + self.tree.state.save(new_info, hi) return hi @@ -326,10 +327,10 @@ def _save_dir(self, path_info, tree, hash_info, save_link=True, **kwargs): if save_link: self.tree.state.save_link(path_info) if self.tree.exists(path_info): - self.tree.state.save(path_info, hi.value) + self.tree.state.save(path_info, hi) cache_info = self.tree.hash_to_path_info(hi.value) - self.tree.state.save(cache_info, hi.value) + self.tree.state.save(cache_info, hi) @use_state def save(self, path_info, tree, hash_info, save_link=True, **kwargs): @@ -461,7 +462,7 @@ def _checkout_file( self.link(cache_info, path_info) self.tree.state.save_link(path_info) - self.tree.state.save(path_info, hash_info.value) + self.tree.state.save(path_info, hash_info) if progress_callback: progress_callback(str(path_info)) @@ -501,7 +502,7 @@ def _checkout_dir( modified = True self.safe_remove(entry_info, force=force) self.link(entry_cache_info, entry_info) - self.tree.state.save(entry_info, entry_hash) + self.tree.state.save(entry_info, entry_hash_info) if progress_callback: progress_callback(str(entry_info)) @@ -511,7 +512,7 @@ def _checkout_dir( ) self.tree.state.save_link(path_info) - self.tree.state.save(path_info, hash_info.value) + self.tree.state.save(path_info, hash_info) # relink is not modified, assume it as nochange return added, not added and modified and not relink @@ -690,9 +691,20 @@ def _merge_dirs(self, ancestor_info, our_info, their_info): # Sorting the list by path to ensure reproducibility return sorted( - self._from_dict(merged), key=itemgetter(self.tree.PARAM_RELPATH) + self._from_dict(merged), key=itemgetter(self.tree.PARAM_RELPATH), ) + def _get_dir_size(self, dir_info): + def _getsize(entry): + return self.tree.getsize( + self.tree.hash_to_path_info(entry[self.tree.PARAM_CHECKSUM]) + ) + + try: + return sum(_getsize(entry) for entry in dir_info) + except FileNotFoundError: + return None + def merge(self, ancestor_info, our_info, their_info): assert our_info assert their_info @@ -706,7 +718,9 @@ def merge(self, ancestor_info, our_info, their_info): their = self.get_dir_cache(their_info) merged = self._merge_dirs(ancestor, our, their) - return self.save_dir_info(merged) + hash_info = self.save_dir_info(merged) + hash_info.size = self._get_dir_size(merged) + return hash_info @use_state def get_hash(self, tree, path_info): @@ -715,9 +729,12 @@ def get_hash(self, tree, path_info): assert hash_info.name == self.tree.PARAM_CHECKSUM return hash_info - return self.save_dir_info(hash_info.dir_info, hash_info) + hi = self.save_dir_info(hash_info.dir_info, hash_info) + hi.size = hash_info.size + return hi def set_dir_info(self, hash_info): assert hash_info.isdir hash_info.dir_info = self._to_dict(self.get_dir_cache(hash_info)) + hash_info.nfiles = len(hash_info.dir_info) diff --git a/dvc/cache/local.py b/dvc/cache/local.py index 1b4a99c4cb..547c890895 100644 --- a/dvc/cache/local.py +++ b/dvc/cache/local.py @@ -451,7 +451,8 @@ def pull(self, named_cache, remote, jobs=None, show_checksums=False): # be removed upon status, while files corrupted during # download will not be moved from tmp_file # (see `BaseTree.download()`) - self.tree.state.save(cache_file, checksum) + hash_info = HashInfo(self.tree.PARAM_CHECKSUM, checksum) + self.tree.state.save(cache_file, hash_info) return ret diff --git a/dvc/hash_info.py b/dvc/hash_info.py index 826b4e1dfe..a4bcbd1fe3 100644 --- a/dvc/hash_info.py +++ b/dvc/hash_info.py @@ -1,3 +1,4 @@ +from collections import OrderedDict from dataclasses import dataclass, field HASH_DIR_SUFFIX = ".dir" @@ -5,22 +6,41 @@ @dataclass class HashInfo: + PARAM_SIZE = "size" + PARAM_NFILES = "nfiles" + name: str value: str dir_info: dict = field(default=None, compare=False) + size: int = field(default=None, compare=False) + nfiles: int = field(default=None, compare=False) def __bool__(self): return bool(self.value) @classmethod def from_dict(cls, d): - if not d: + _d = d.copy() if d else {} + size = _d.pop(cls.PARAM_SIZE, None) + nfiles = _d.pop(cls.PARAM_NFILES, None) + + if not _d: return cls(None, None) - ((name, value),) = d.items() - return cls(name, value) + + ((name, value),) = _d.items() + return cls(name, value, size=size, nfiles=nfiles) def to_dict(self): - return {self.name: self.value} if self else {} + ret = OrderedDict() + if not self: + return ret + + ret[self.name] = self.value + if self.size is not None: + ret[self.PARAM_SIZE] = self.size + if self.nfiles is not None: + ret[self.PARAM_NFILES] = self.nfiles + return ret @property def isdir(self): diff --git a/dvc/output/__init__.py b/dvc/output/__init__.py index 7cfdf9e0fb..389860b462 100644 --- a/dvc/output/__init__.py +++ b/dvc/output/__init__.py @@ -4,6 +4,7 @@ from funcy import collecting, project from voluptuous import And, Any, Coerce, Length, Lower, Required, SetTo +from dvc.hash_info import HashInfo from dvc.output.base import BaseOutput from dvc.output.gs import GSOutput from dvc.output.hdfs import HDFSOutput @@ -59,6 +60,8 @@ SCHEMA[BaseOutput.PARAM_METRIC] = BaseOutput.METRIC_SCHEMA SCHEMA[BaseOutput.PARAM_PLOT] = bool SCHEMA[BaseOutput.PARAM_PERSIST] = bool +SCHEMA[HashInfo.PARAM_SIZE] = int +SCHEMA[HashInfo.PARAM_NFILES] = int def _get( diff --git a/dvc/output/base.py b/dvc/output/base.py index 24af234bc9..b70936f576 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -530,8 +530,15 @@ def _check_can_merge(self, out): my = self.dumpd() other = out.dumpd() - my.pop(self.tree.PARAM_CHECKSUM) - other.pop(self.tree.PARAM_CHECKSUM) + ignored = [ + self.tree.PARAM_CHECKSUM, + HashInfo.PARAM_SIZE, + HashInfo.PARAM_NFILES, + ] + + for opt in ignored: + my.pop(opt, None) + other.pop(opt, None) if my != other: raise MergeError( diff --git a/dvc/schema.py b/dvc/schema.py index f11f37c13d..7c470edf2e 100644 --- a/dvc/schema.py +++ b/dvc/schema.py @@ -1,6 +1,7 @@ from voluptuous import Any, Optional, Required, Schema from dvc import dependency, output +from dvc.hash_info import HashInfo from dvc.output import CHECKSUMS_SCHEMA, BaseOutput from dvc.parsing import FOREACH_KWD, IN_KWD, SET_KWD, USE_KWD, VARS_KWD from dvc.stage.params import StageParams @@ -18,7 +19,12 @@ StageParams.PARAM_ALWAYS_CHANGED: bool, } -DATA_SCHEMA = {**CHECKSUMS_SCHEMA, Required("path"): str} +DATA_SCHEMA = { + **CHECKSUMS_SCHEMA, + Required("path"): str, + HashInfo.PARAM_SIZE: int, + HashInfo.PARAM_NFILES: int, +} LOCK_FILE_STAGE_SCHEMA = { Required(StageParams.PARAM_CMD): str, StageParams.PARAM_DEPS: [DATA_SCHEMA], diff --git a/dvc/stage/cache.py b/dvc/stage/cache.py index 0482e01bc0..e7c5b721ac 100644 --- a/dvc/stage/cache.py +++ b/dvc/stage/cache.py @@ -25,9 +25,13 @@ def __init__(self, stage): def _get_cache_hash(cache, key=False): + from dvc.hash_info import HashInfo + if key: cache["outs"] = [out["path"] for out in cache.get("outs", [])] - return dict_sha256(cache) + return dict_sha256( + cache, exclude=[HashInfo.PARAM_SIZE, HashInfo.PARAM_NFILES] + ) def _get_stage_hash(stage): diff --git a/dvc/stage/serialize.py b/dvc/stage/serialize.py index cf0fc25e1d..46c2320bab 100644 --- a/dvc/stage/serialize.py +++ b/dvc/stage/serialize.py @@ -134,18 +134,18 @@ def to_pipeline_file(stage: "PipelineStage"): def to_single_stage_lockfile(stage: "Stage") -> dict: assert stage.cmd + def _dumpd(item): + ret = [ + (item.PARAM_PATH, item.def_path), + *item.hash_info.to_dict().items(), + ] + + return OrderedDict(ret) + res = OrderedDict([("cmd", stage.cmd)]) params, deps = split_params_deps(stage) deps, outs = [ - [ - OrderedDict( - [ - (PARAM_PATH, item.def_path), - *item.hash_info.to_dict().items(), - ] - ) - for item in sort_by_path(items) - ] + [_dumpd(item) for item in sort_by_path(items)] for items in [deps, stage.outs] ] params = _serialize_params_values(params) diff --git a/dvc/stage/utils.py b/dvc/stage/utils.py index feb2a348fb..ff94d83e4b 100644 --- a/dvc/stage/utils.py +++ b/dvc/stage/utils.py @@ -8,6 +8,7 @@ from dvc.utils.fs import path_isin from ..dependency import ParamsDependency +from ..hash_info import HashInfo from ..tree.local import LocalTree from ..tree.s3 import S3Tree from ..utils import dict_md5, format_link, relpath @@ -136,6 +137,8 @@ def stage_dump_eq(stage_cls, old_d, new_d): for out in outs: out.pop(LocalTree.PARAM_CHECKSUM, None) out.pop(S3Tree.PARAM_CHECKSUM, None) + out.pop(HashInfo.PARAM_SIZE, None) + out.pop(HashInfo.PARAM_NFILES, None) # outs and deps are lists of dicts. To check equality, we need to make # them independent of the order, so, we convert them to dicts. @@ -171,6 +174,8 @@ def compute_md5(stage): stage.PARAM_FROZEN, BaseOutput.PARAM_METRIC, BaseOutput.PARAM_PERSIST, + HashInfo.PARAM_SIZE, + HashInfo.PARAM_NFILES, ], ) diff --git a/dvc/state.py b/dvc/state.py index 4009c8c7ef..82f1f02bc0 100644 --- a/dvc/state.py +++ b/dvc/state.py @@ -8,6 +8,7 @@ from urllib.parse import urlencode, urlunparse from dvc.exceptions import DvcException +from dvc.hash_info import HashInfo from dvc.utils import current_timestamp, relpath, to_chunks from dvc.utils.fs import get_inode, get_mtime_and_size, remove @@ -40,7 +41,7 @@ def files(self): pass @abstractmethod - def save(self, path_info, checksum): + def save(self, path_info, hash_info): pass @abstractmethod @@ -76,7 +77,7 @@ class StateNoop(StateBase): def files(self): return [] - def save(self, path_info, checksum): + def save(self, path_info, hash_info): pass def get(self, path_info): # pylint: disable=unused-argument @@ -396,15 +397,16 @@ def get_state_record_for_inode(self, inode): return results[0] return None - def save(self, path_info, checksum): - """Save checksum for the specified path info. + def save(self, path_info, hash_info): + """Save hash for the specified path info. Args: - path_info (dict): path_info to save checksum for. - checksum (str): checksum to save. + path_info (dict): path_info to save hash for. + hash_info (HashInfo): hash to save. """ assert isinstance(path_info, str) or path_info.scheme == "local" - assert checksum is not None + assert hash_info + assert isinstance(hash_info, HashInfo) assert os.path.exists(path_info) actual_mtime, actual_size = get_mtime_and_size(path_info, self.tree) @@ -413,23 +415,23 @@ def save(self, path_info, checksum): existing_record = self.get_state_record_for_inode(actual_inode) if not existing_record: self._insert_new_state_record( - actual_inode, actual_mtime, actual_size, checksum + actual_inode, actual_mtime, actual_size, hash_info.value ) return self._update_state_for_path_changed( - actual_inode, actual_mtime, actual_size, checksum + actual_inode, actual_mtime, actual_size, hash_info.value ) def get(self, path_info): - """Gets the checksum for the specified path info. Checksum will be + """Gets the hash for the specified path info. Hash will be retrieved from the state database if available. Args: - path_info (dict): path info to get the checksum for. + path_info (dict): path info to get the hash for. Returns: - str or None: checksum for the specified path info or None if it + HashInfo or None: hash for the specified path info or None if it doesn't exist in the state database. """ assert isinstance(path_info, str) or path_info.scheme == "local" @@ -448,12 +450,12 @@ def get(self, path_info): if not existing_record: return None - mtime, size, checksum, _ = existing_record + mtime, size, value, _ = existing_record if self._file_metadata_changed(actual_mtime, mtime, actual_size, size): return None self._update_state_record_timestamp_for_inode(actual_inode) - return checksum + return HashInfo("md5", value, size=int(actual_size)) def save_link(self, path_info): """Adds the specified path to the list of links created by dvc. This diff --git a/dvc/tree/base.py b/dvc/tree/base.py index 6fc2bd3615..b4d4c8556a 100644 --- a/dvc/tree/base.py +++ b/dvc/tree/base.py @@ -8,7 +8,6 @@ from funcy import cached_property, decorator from dvc.exceptions import DvcException, DvcIgnoreInCollectedDirError -from dvc.hash_info import HashInfo from dvc.ignore import DvcIgnore from dvc.path_info import URLInfo from dvc.progress import Tqdm @@ -247,22 +246,22 @@ def get_hash(self, path_info, **kwargs): return None # pylint: disable=assignment-from-none - hash_ = self.state.get(path_info) + hash_info = self.state.get(path_info) # If we have dir hash in state db, but dir cache file is lost, # then we need to recollect the dir via .get_dir_hash() call below, # see https://github.com/iterative/dvc/issues/2219 for context if ( - hash_ - and self.is_dir_hash(hash_) + hash_info + and hash_info.isdir and not self.cache.tree.exists( - self.cache.tree.hash_to_path_info(hash_) + self.cache.tree.hash_to_path_info(hash_info.value) ) ): - hash_ = None + hash_info = None - if hash_: - hash_info = HashInfo(self.PARAM_CHECKSUM, hash_) + if hash_info: + assert hash_info.name == self.PARAM_CHECKSUM if hash_info.isdir: self.cache.set_dir_info(hash_info) return hash_info @@ -273,7 +272,7 @@ def get_hash(self, path_info, **kwargs): hash_info = self.get_file_hash(path_info) if hash_info and self.exists(path_info): - self.state.save(path_info, hash_info.value) + self.state.save(path_info, hash_info) return hash_info @@ -300,10 +299,11 @@ def _calculate_hashes(self, file_infos): ) as pbar: worker = pbar.wrap_fn(self.get_file_hash) with ThreadPoolExecutor(max_workers=self.hash_jobs) as executor: - hashes = (hi.value for hi in executor.map(worker, file_infos)) - return dict(zip(file_infos, hashes)) + hash_infos = executor.map(worker, file_infos) + return dict(zip(file_infos, hash_infos)) def _collect_dir(self, path_info, **kwargs): + file_infos = set() for fname in self.walk_files(path_info, **kwargs): @@ -312,15 +312,21 @@ def _collect_dir(self, path_info, **kwargs): file_infos.add(fname) - hashes = {fi: self.state.get(fi) for fi in file_infos} - not_in_state = {fi for fi, hash_ in hashes.items() if hash_ is None} + hash_infos = {fi: self.state.get(fi) for fi in file_infos} + not_in_state = {fi for fi, hi in hash_infos.items() if hi is None} + + new_hash_infos = self._calculate_hashes(not_in_state) + hash_infos.update(new_hash_infos) - new_hashes = self._calculate_hashes(not_in_state) - hashes.update(new_hashes) + sizes = [hi.size for hi in hash_infos.values()] + if None in sizes: + size = None + else: + size = sum(sizes) - return [ + dir_info = [ { - self.PARAM_CHECKSUM: hashes[fi], + self.PARAM_CHECKSUM: hash_infos[fi].value, # NOTE: this is lossy transformation: # "hey\there" -> "hey/there" # "hey/there" -> "hey/there" @@ -334,10 +340,14 @@ def _collect_dir(self, path_info, **kwargs): for fi in file_infos ] + return (dir_info, size) + @use_state def get_dir_hash(self, path_info, **kwargs): - dir_info = self._collect_dir(path_info, **kwargs) - return self.repo.cache.local.save_dir_info(dir_info) + dir_info, size = self._collect_dir(path_info, **kwargs) + hash_info = self.repo.cache.local.save_dir_info(dir_info) + hash_info.size = size + return hash_info def upload(self, from_info, to_info, name=None, no_progress_bar=False): if not hasattr(self, "_upload"): diff --git a/dvc/tree/hdfs.py b/dvc/tree/hdfs.py index a57493c771..4ea629e9e6 100644 --- a/dvc/tree/hdfs.py +++ b/dvc/tree/hdfs.py @@ -177,10 +177,15 @@ def get_file_hash(self, path_info): stdout = self.hadoop_fs( f"checksum {path_info.url}", user=path_info.user ) - return HashInfo( - self.PARAM_CHECKSUM, self._group(regex, stdout, "checksum") + hash_info = HashInfo( + self.PARAM_CHECKSUM, self._group(regex, stdout, "checksum"), ) + with self.hdfs(path_info) as hdfs: + hash_info.size = hdfs.info(path_info.path)["size"] + + return hash_info + def _upload( self, from_file, to_info, name=None, no_progress_bar=False, **_kwargs ): diff --git a/dvc/tree/local.py b/dvc/tree/local.py index c457655838..ce56cc78c4 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -312,7 +312,12 @@ def is_protected(self, path_info): return stat.S_IMODE(mode) == self.CACHE_MODE def get_file_hash(self, path_info): - return HashInfo(self.PARAM_CHECKSUM, file_md5(path_info)[0]) + hash_info = HashInfo(self.PARAM_CHECKSUM, file_md5(path_info)[0],) + + if hash_info: + hash_info.size = os.path.getsize(path_info) + + return hash_info @staticmethod def getsize(path_info): diff --git a/dvc/tree/s3.py b/dvc/tree/s3.py index a48332c79d..0373cc1bbe 100644 --- a/dvc/tree/s3.py +++ b/dvc/tree/s3.py @@ -336,7 +336,11 @@ def _copy(cls, s3, from_info, to_info, extra_args): def get_file_hash(self, path_info): with self._get_obj(path_info) as obj: - return HashInfo(self.PARAM_CHECKSUM, obj.e_tag.strip('"')) + return HashInfo( + self.PARAM_CHECKSUM, + obj.e_tag.strip('"'), + size=obj.content_length, + ) def _upload(self, from_file, to_info, name=None, no_progress_bar=False): with self._get_obj(to_info) as obj: diff --git a/dvc/tree/ssh/__init__.py b/dvc/tree/ssh/__init__.py index d6ba6a89af..1af9a88cab 100644 --- a/dvc/tree/ssh/__init__.py +++ b/dvc/tree/ssh/__init__.py @@ -241,7 +241,12 @@ def get_file_hash(self, path_info): raise NotImplementedError with self.ssh(path_info) as ssh: - return HashInfo(self.PARAM_CHECKSUM, ssh.md5(path_info.path)) + hash_info = HashInfo(self.PARAM_CHECKSUM, ssh.md5(path_info.path),) + + if hash_info: + hash_info.size = ssh.getsize(path_info.path) + + return hash_info def getsize(self, path_info): with self.ssh(path_info) as ssh: diff --git a/tests/func/test_add.py b/tests/func/test_add.py index ed05a9f704..93f63f73fb 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -46,7 +46,13 @@ def test_add(tmp_dir, dvc): assert stage.md5 is None assert load_yaml("foo.dvc") == { - "outs": [{"md5": "acbd18db4cc2f85cedef654fccc4a4d8", "path": "foo"}], + "outs": [ + { + "md5": "acbd18db4cc2f85cedef654fccc4a4d8", + "path": "foo", + "size": 3, + } + ], } @@ -226,6 +232,7 @@ def test_add_external_file(tmp_dir, dvc, workspace, hash_name, hash_value): assert (tmp_dir / "file.dvc").read_text() == ( "outs:\n" f"- {hash_name}: {hash_value}\n" + " size: 4\n" " path: remote://workspace/file\n" ) assert (workspace / "file").read_text() == "file" @@ -272,6 +279,8 @@ def test_add_external_dir(tmp_dir, dvc, workspace, hash_name, hash_value): assert (tmp_dir / "dir.dvc").read_text() == ( "outs:\n" f"- {hash_name}: {hash_value}\n" + " size: 11\n" + " nfiles: 2\n" " path: remote://workspace/dir\n" ) assert (workspace / "cache" / hash_value[:2] / hash_value[2:]).is_file() diff --git a/tests/func/test_import_url.py b/tests/func/test_import_url.py index 4af804deb4..f1eb4b83fe 100644 --- a/tests/func/test_import_url.py +++ b/tests/func/test_import_url.py @@ -195,9 +195,13 @@ def test_import_url_dir(tmp_dir, dvc, workspace, stage_md5, dir_md5): "frozen: true\n" "deps:\n" f"- md5: {dir_md5}\n" + " size: 11\n" + " nfiles: 2\n" " path: remote://workspace/dir\n" "outs:\n" "- md5: b6dcab6ccd17ca0a8bf4a215a37d14cc.dir\n" + " size: 11\n" + " nfiles: 2\n" " path: dir\n" ) diff --git a/tests/func/test_install.py b/tests/func/test_install.py index 87e61f16f5..4ecd37487e 100644 --- a/tests/func/test_install.py +++ b/tests/func/test_install.py @@ -104,6 +104,8 @@ def test_merge_driver_no_ancestor(tmp_dir, scm, dvc): assert (tmp_dir / "data.dvc").read_text() == ( "outs:\n" "- md5: 5ea40360f5b4ec688df672a4db9c17d1.dir\n" + " size: 6\n" + " nfiles: 2\n" " path: data\n" ) @@ -137,6 +139,8 @@ def test_merge_driver(tmp_dir, scm, dvc): assert (tmp_dir / "data.dvc").read_text() == ( "outs:\n" "- md5: 839ef9371606817569c1ee0e5f4ed233.dir\n" + " size: 12\n" + " nfiles: 3\n" " path: data\n" ) diff --git a/tests/func/test_lockfile.py b/tests/func/test_lockfile.py index c1de074b0e..c75fde2afd 100644 --- a/tests/func/test_lockfile.py +++ b/tests/func/test_lockfile.py @@ -43,8 +43,12 @@ def test_deps_outs_are_sorted_by_path(tmp_dir, dvc, run_head): assert list(lock.keys()) == ["cmd", "deps", "outs"] # `path` key appear first and then the `md5` - assert all(list(dep.keys()) == ["path", "md5"] for dep in lock["deps"]) - assert all(list(out.keys()) == ["path", "md5"] for out in lock["outs"]) + assert all( + list(dep.keys()) == ["path", "md5", "size"] for dep in lock["deps"] + ) + assert all( + list(out.keys()) == ["path", "md5", "size"] for out in lock["outs"] + ) # deps are always sorted by the file path naming assert list(map(itemgetter("path"), lock["deps"])) == sorted(deps) diff --git a/tests/func/test_remote.py b/tests/func/test_remote.py index e98e81ebe8..a779b02104 100644 --- a/tests/func/test_remote.py +++ b/tests/func/test_remote.py @@ -152,20 +152,20 @@ def test_dir_hash_should_be_key_order_agnostic(tmp_dir, dvc): with patch.object( BaseTree, "_collect_dir", - return_value=[ - {"relpath": "1", "md5": "1"}, - {"relpath": "2", "md5": "2"}, - ], + return_value=( + [{"relpath": "1", "md5": "1"}, {"relpath": "2", "md5": "2"}], + None, + ), ): hash1 = dvc.cache.local.tree.get_hash(path_info) with patch.object( BaseTree, "_collect_dir", - return_value=[ - {"md5": "1", "relpath": "1"}, - {"md5": "2", "relpath": "2"}, - ], + return_value=( + [{"md5": "1", "relpath": "1"}, {"md5": "2", "relpath": "2"}], + None, + ), ): hash2 = dvc.cache.local.tree.get_hash(path_info) diff --git a/tests/func/test_repro.py b/tests/func/test_repro.py index 26d3530822..bd414b2901 100644 --- a/tests/func/test_repro.py +++ b/tests/func/test_repro.py @@ -1183,7 +1183,16 @@ def test_dvc_formatting_retained(tmp_dir, dvc, run_copy): (tmp_dir / "foo").write_text("new foo") dvc.reproduce("foo_copy.dvc", force=True) - assert _hide_md5(stage_text) == _hide_md5(stage_path.read_text()) + def _hide_md5(text): + return re.sub(r"\b[a-f0-9]{32}\b", "", text) + + def _hide_size(text): + return re.sub(r"size: [0-9]*\b", "size: ", text) + + def _mask(text): + return _hide_size(_hide_md5(text)) + + assert _mask(stage_text) == _mask(stage_path.read_text()) def _format_dvc_line(line): @@ -1198,10 +1207,6 @@ def _format_dvc_line(line): return line -def _hide_md5(text): - return re.sub(r"\b[a-f0-9]{32}\b", "", text) - - def test_downstream(dvc): # The dependency graph should look like this: # diff --git a/tests/func/test_run_multistage.py b/tests/func/test_run_multistage.py index e7d49a63da..98094b7e1d 100644 --- a/tests/func/test_run_multistage.py +++ b/tests/func/test_run_multistage.py @@ -429,9 +429,11 @@ def test_run_external_outputs( " deps:\n" " - path: remote://workspace/foo\n" f" {hash_name}: {foo_hash}\n" + " size: 3\n" " outs:\n" " - path: remote://workspace/bar\n" f" {hash_name}: {bar_hash}\n" + " size: 3\n" ) assert (workspace / "foo").read_text() == "foo" diff --git a/tests/func/test_state.py b/tests/func/test_state.py index ab1c8f745e..888d01e4ae 100644 --- a/tests/func/test_state.py +++ b/tests/func/test_state.py @@ -2,6 +2,7 @@ import mock +from dvc.hash_info import HashInfo from dvc.path_info import PathInfo from dvc.state import State from dvc.utils import file_md5 @@ -11,26 +12,23 @@ def test_state(tmp_dir, dvc): tmp_dir.gen("foo", "foo content") path = tmp_dir / "foo" path_info = PathInfo(path) - md5 = file_md5(path)[0] + hash_info = HashInfo("md5", file_md5(path)[0]) state = State(dvc) with state: - state.save(path_info, md5) - entry_md5 = state.get(path_info) - assert entry_md5 == md5 + state.save(path_info, hash_info) + assert state.get(path_info) == hash_info path.unlink() path.write_text("1") - entry_md5 = state.get(path_info) - assert entry_md5 is None + assert state.get(path_info) is None - md5 = file_md5(path)[0] - state.save(path_info, md5) + hash_info = HashInfo("md5", file_md5(path)[0]) + state.save(path_info, hash_info) - entry_md5 = state.get(path_info) - assert entry_md5 == md5 + assert state.get(path_info) == hash_info def test_state_overflow(tmp_dir, dvc): @@ -66,7 +64,7 @@ def test_get_state_record_for_inode(get_inode_mock, tmp_dir, dvc): get_inode_mock.side_effect = mock_get_inode(inode) with state: - state.save(PathInfo(foo), md5) + state.save(PathInfo(foo), HashInfo("md5", md5)) ret = state.get_state_record_for_inode(inode) assert ret is not None