From 3f5757a0a5b170f6a1015b324920ddf4e6219612 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Thu, 12 May 2022 21:40:45 +0545 Subject: [PATCH] dvc.objects: move hashing functions to dvc.objects This commit moves `HashedStreamReader`/`get_hash_file`/`file_md5` into `dvc.objects.hash`. The `get_hash_file` has been renamed to `hash_file`. This also copies `relpath` and `is_exec` to the `dvc.fs.utils`. --- dvc/data/stage.py | 59 +----------- dvc/{utils => data}/stream.py | 2 +- dvc/data/tree.py | 5 +- dvc/fs/_callback.py | 18 +++- dvc/fs/base.py | 28 +++++- dvc/fs/dvc.py | 2 +- dvc/fs/http.py | 4 +- dvc/fs/utils.py | 31 +++++++ dvc/objects/cache.py | 2 +- dvc/objects/file.py | 6 +- dvc/objects/hash.py | 136 ++++++++++++++++++++++++++++ dvc/objects/state.py | 2 +- dvc/utils/__init__.py | 60 +----------- tests/func/objects/test_hash.py | 16 ++++ tests/func/test_add.py | 7 +- tests/func/test_data_cloud.py | 4 +- tests/func/test_install.py | 2 +- tests/func/test_repro.py | 3 +- tests/func/test_run_single_stage.py | 2 +- tests/func/test_state.py | 2 +- tests/func/test_utils.py | 8 -- tests/unit/fs/test_data.py | 4 +- tests/unit/fs/test_dvc.py | 8 +- tests/unit/utils/test_stream.py | 4 +- tests/unit/utils/test_utils.py | 9 -- 25 files changed, 257 insertions(+), 167 deletions(-) rename dvc/{utils => data}/stream.py (96%) create mode 100644 dvc/fs/utils.py create mode 100644 dvc/objects/hash.py create mode 100644 tests/func/objects/test_hash.py diff --git a/dvc/data/stage.py b/dvc/data/stage.py index 280b530262..9879e2f4a7 100644 --- a/dvc/data/stage.py +++ b/dvc/data/stage.py @@ -9,10 +9,10 @@ from dvc.exceptions import DvcIgnoreInCollectedDirError from dvc.ignore import DvcIgnore from dvc.objects.file import HashFile +from dvc.objects.hash import hash_file from dvc.objects.hash_info import HashInfo from dvc.objects.meta import Meta from dvc.progress import Tqdm -from dvc.utils import file_md5, is_exec from .db.reference import ReferenceObjectDB @@ -32,7 +32,8 @@ def _upload_file(from_fs_path, fs, odb, upload_odb, callback=None): from dvc.fs._callback import FsspecCallback from dvc.utils import tmp_fname - from dvc.utils.stream import HashedStreamReader + + from .stream import HashedStreamReader fs_path = upload_odb.fs.path tmp_info = fs_path.join(upload_odb.fs_path, tmp_fname()) @@ -52,59 +53,9 @@ def _upload_file(from_fs_path, fs, odb, upload_odb, callback=None): return from_fs_path, meta, odb.get(stream.hash_info) -def _adapt_info(info, scheme): - if scheme == "s3" and "ETag" in info: - info["etag"] = info["ETag"].strip('"') - elif scheme == "gs" and "etag" in info: - import base64 - - info["etag"] = base64.b64decode(info["etag"]).hex() - elif scheme.startswith("http") and ( - "ETag" in info or "Content-MD5" in info - ): - info["checksum"] = info.get("ETag") or info.get("Content-MD5") - return info - - -def _get_file_hash(fs_path, fs, name): - info = _adapt_info(fs.info(fs_path), fs.scheme) - - if name in info: - assert not info[name].endswith(".dir") - hash_value = info[name] - elif hasattr(fs, name): - func = getattr(fs, name) - hash_value = func(fs_path) - elif name == "md5": - hash_value = file_md5(fs_path, fs) - else: - raise NotImplementedError - - meta = Meta(size=info["size"], isexec=is_exec(info.get("mode", 0))) - hash_info = HashInfo(name, hash_value) - return meta, hash_info - - -def get_file_hash(fs_path, fs, name, state=None): - if state: - meta, hash_info = state.get( # pylint: disable=assignment-from-none - fs_path, fs - ) - if hash_info: - return meta, hash_info - - meta, hash_info = _get_file_hash(fs_path, fs, name) - - if state: - assert ".dir" not in hash_info.value - state.save(fs_path, fs, hash_info) - - return meta, hash_info - - def _stage_file(fs_path, fs, name, odb=None, upload_odb=None, dry_run=False): state = odb.state if odb else None - meta, hash_info = get_file_hash(fs_path, fs, name, state=state) + meta, hash_info = hash_file(fs_path, fs, name, state=state) if upload_odb and not dry_run: assert odb and name == "md5" return _upload_file(fs_path, fs, odb, upload_odb) @@ -346,7 +297,7 @@ def _stage_external_tree_info(odb, tree, name): odb.add(tree.fs_path, tree.fs, tree.hash_info) raw = odb.get(tree.hash_info) - _, hash_info = get_file_hash(raw.fs_path, raw.fs, name, state=odb.state) + _, hash_info = hash_file(raw.fs_path, raw.fs, name, state=odb.state) tree.fs_path = raw.fs_path tree.fs = raw.fs tree.hash_info.name = hash_info.name diff --git a/dvc/utils/stream.py b/dvc/data/stream.py similarity index 96% rename from dvc/utils/stream.py rename to dvc/data/stream.py index af5d68b1cb..c082c1f5fb 100644 --- a/dvc/utils/stream.py +++ b/dvc/data/stream.py @@ -3,9 +3,9 @@ from funcy import cached_property +from dvc.objects.hash import dos2unix from dvc.objects.hash_info import HashInfo from dvc.objects.istextfile import DEFAULT_CHUNK_SIZE, istextblock -from dvc.utils import dos2unix class HashedStreamReader(io.IOBase): diff --git a/dvc/data/tree.py b/dvc/data/tree.py index 10f3adc0df..90466ee138 100644 --- a/dvc/data/tree.py +++ b/dvc/data/tree.py @@ -7,8 +7,7 @@ from dvc.objects.errors import ObjectFormatError from dvc.objects.file import HashFile - -from .stage import get_file_hash +from dvc.objects.hash import hash_file if TYPE_CHECKING: from dvc.objects.db import ObjectDB @@ -67,7 +66,7 @@ def digest(self, hash_info: Optional["HashInfo"] = None): if hash_info: self.hash_info = hash_info else: - _, self.hash_info = get_file_hash(fs_path, memfs, "md5") + _, self.hash_info = hash_file(fs_path, memfs, "md5") assert self.hash_info.value self.hash_info.value += ".dir" diff --git a/dvc/fs/_callback.py b/dvc/fs/_callback.py index 8eff340dd8..2e19aff66f 100644 --- a/dvc/fs/_callback.py +++ b/dvc/fs/_callback.py @@ -1,12 +1,12 @@ from contextlib import ExitStack from functools import wraps -from typing import IO, TYPE_CHECKING, Any, Dict, Optional, TypeVar, cast +from typing import TYPE_CHECKING, Any, Dict, Optional, TypeVar, overload import fsspec from funcy import cached_property if TYPE_CHECKING: - from typing import Callable + from typing import BinaryIO, Callable, TextIO, Union from typing_extensions import ParamSpec @@ -20,11 +20,21 @@ class FsspecCallback(fsspec.Callback): """FsspecCallback usable as a context manager, and a few helper methods.""" - def wrap_attr(self, fobj: IO, method: str = "read") -> IO: + @overload + def wrap_attr(self, fobj: "BinaryIO", method: str = "read") -> "BinaryIO": + ... + + @overload + def wrap_attr(self, fobj: "TextIO", method: str = "read") -> "TextIO": + ... + + def wrap_attr( + self, fobj: "Union[TextIO, BinaryIO]", method: str = "read" + ) -> "Union[TextIO, BinaryIO]": from tqdm.utils import CallbackIOWrapper wrapped = CallbackIOWrapper(self.relative_update, fobj, method) - return cast(IO, wrapped) + return wrapped def wrap_fn(self, fn: "Callable[_P, _R]") -> "Callable[_P, _R]": @wraps(fn) diff --git a/dvc/fs/base.py b/dvc/fs/base.py index c733f39eef..03efbe7735 100644 --- a/dvc/fs/base.py +++ b/dvc/fs/base.py @@ -25,6 +25,8 @@ from ._callback import DEFAULT_CALLBACK, FsspecCallback if TYPE_CHECKING: + from typing import BinaryIO, TextIO + from fsspec.spec import AbstractFileSystem from typing_extensions import Literal @@ -179,12 +181,30 @@ def is_empty(self, path: AnyFSPath) -> bool: return not self.fs.ls(path) return entry["size"] == 0 + @overload + def open( + self, + path: AnyFSPath, + mode: "Literal['rb', 'br', 'wb']", + **kwargs: Any, + ) -> "BinaryIO": # pylint: disable=arguments-differ + return self.open(path, mode, **kwargs) + + @overload + def open( + self, + path: AnyFSPath, + mode: "Literal['r', 'rt', 'w']", + **kwargs: Any, + ) -> "TextIO": # pylint: disable=arguments-differ + ... + def open( self, path: AnyFSPath, mode: str = "r", - **kwargs, - ) -> "IO": # pylint: disable=arguments-differ + **kwargs: Any, + ) -> "IO[Any]": # pylint: disable=arguments-differ if "b" in mode: kwargs.pop("encoding", None) return self.fs.open(path, mode=mode, **kwargs) @@ -354,7 +374,7 @@ def makedirs(self, path: AnyFSPath, **kwargs: Any) -> None: def put_file( self, - from_file: Union[AnyFSPath, IO], + from_file: Union[AnyFSPath, "BinaryIO"], to_info: AnyFSPath, callback: FsspecCallback = DEFAULT_CALLBACK, size: int = None, @@ -363,7 +383,7 @@ def put_file( if size: callback.set_size(size) if hasattr(from_file, "read"): - stream = callback.wrap_attr(cast("IO", from_file)) + stream = callback.wrap_attr(cast("BinaryIO", from_file)) self.upload_fobj(stream, to_info, size=size) else: assert isinstance(from_file, str) diff --git a/dvc/fs/dvc.py b/dvc/fs/dvc.py index 5a6aeff4ca..b3251f5f31 100644 --- a/dvc/fs/dvc.py +++ b/dvc/fs/dvc.py @@ -41,7 +41,7 @@ def _ls(fs, path): def _merge_info(repo, fs_info, dvc_info): - from dvc.utils import is_exec + from dvc.fs.utils import is_exec ret = {"repo": repo} diff --git a/dvc/fs/http.py b/dvc/fs/http.py index f17930e7c3..c0c9530ad9 100644 --- a/dvc/fs/http.py +++ b/dvc/fs/http.py @@ -1,5 +1,5 @@ import threading -from typing import IO, Union +from typing import BinaryIO, Union from funcy import cached_property, memoize, wrap_with @@ -141,7 +141,7 @@ def unstrip_protocol(self, path: str) -> str: def put_file( self, - from_file: Union[AnyFSPath, IO], + from_file: Union[AnyFSPath, BinaryIO], to_info: AnyFSPath, callback: FsspecCallback = DEFAULT_CALLBACK, size: int = None, diff --git a/dvc/fs/utils.py b/dvc/fs/utils.py new file mode 100644 index 0000000000..5ab5aa34d9 --- /dev/null +++ b/dvc/fs/utils.py @@ -0,0 +1,31 @@ +import os +import stat +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from .base import AnyFSPath + + +def is_exec(mode: int) -> bool: + return bool(mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)) + + +def relpath(path: "AnyFSPath", start: "AnyFSPath" = os.curdir) -> "AnyFSPath": + path = os.fspath(path) + start = os.path.abspath(os.fspath(start)) + + # Windows path on different drive than curdir doesn't have relpath + if os.name == "nt": + # Since python 3.8 os.realpath resolves network shares to their UNC + # path. So, to be certain that relative paths correctly captured, + # we need to resolve to UNC path first. We resolve only the drive + # name so that we don't follow any 'real' symlinks on the path + def resolve_network_drive_windows(path_to_resolve): + drive, tail = os.path.splitdrive(path_to_resolve) + return os.path.join(os.path.realpath(drive), tail) + + path = resolve_network_drive_windows(os.path.abspath(path)) + start = resolve_network_drive_windows(start) + if not os.path.commonprefix([start, path]): + return path + return os.path.relpath(path, start) diff --git a/dvc/objects/cache.py b/dvc/objects/cache.py index 175fb91c25..0aa92d2c47 100644 --- a/dvc/objects/cache.py +++ b/dvc/objects/cache.py @@ -57,5 +57,5 @@ def __init__( ) -> None: settings.setdefault("disk_pickle_protocol", 4) super().__init__( - directory=directory, timeout=timeout, disk=Disk, **settings + directory=directory, timeout=timeout, disk=disk, **settings ) diff --git a/dvc/objects/file.py b/dvc/objects/file.py index aa62a85d49..a1575f98bd 100644 --- a/dvc/objects/file.py +++ b/dvc/objects/file.py @@ -65,10 +65,10 @@ def check(self, odb: "ObjectDB", check_hash: bool = True): self._check_hash(odb) def _check_hash(self, odb): - from dvc.data.stage import get_file_hash - from dvc.objects.errors import ObjectFormatError + from .errors import ObjectFormatError + from .hash import hash_file - _, actual = get_file_hash( + _, actual = hash_file( self.fs_path, self.fs, self.hash_info.name, odb.state ) diff --git a/dvc/objects/hash.py b/dvc/objects/hash.py new file mode 100644 index 0000000000..be8effc60c --- /dev/null +++ b/dvc/objects/hash.py @@ -0,0 +1,136 @@ +import hashlib +import logging +from typing import TYPE_CHECKING, Any, BinaryIO, Dict, Tuple + +from dvc.fs._callback import DEFAULT_CALLBACK, FsspecCallback, TqdmCallback +from dvc.fs.utils import is_exec + +from .hash_info import HashInfo +from .istextfile import istextfile +from .meta import Meta + +logger = logging.getLogger(__name__) + + +if TYPE_CHECKING: + from dvc.fs.base import AnyFSPath, FileSystem + + from .state import StateBase + + +def _adapt_info(info: Dict[str, Any], scheme: str) -> Dict[str, Any]: + if scheme == "s3" and "ETag" in info: + info["etag"] = info["ETag"].strip('"') + elif scheme == "gs" and "etag" in info: + import base64 + + info["etag"] = base64.b64decode(info["etag"]).hex() + elif scheme.startswith("http") and ( + "ETag" in info or "Content-MD5" in info + ): + info["checksum"] = info.get("ETag") or info.get("Content-MD5") + return info + + +def dos2unix(data: bytes) -> bytes: + return data.replace(b"\r\n", b"\n") + + +def _fobj_md5( + fobj: BinaryIO, + hash_md5: "hashlib._Hash", + binary: bool, + chunk_size: int = 2**20, +) -> None: + while True: + data = fobj.read(chunk_size) + if not data: + break + chunk = data if binary else dos2unix(data) + hash_md5.update(chunk) + + +def file_md5( + fname: "AnyFSPath", + fs: "FileSystem", + callback: "FsspecCallback" = DEFAULT_CALLBACK, +) -> str: + """get the (md5 hexdigest, md5 digest) of a file""" + + hash_md5 = hashlib.md5() + binary = not istextfile(fname, fs=fs) + size = fs.size(fname) or 0 + callback.set_size(size) + with fs.open(fname, "rb") as fobj: + _fobj_md5(callback.wrap_attr(fobj), hash_md5, binary) + return hash_md5.hexdigest() + + +def _hash_file( + fs_path: "AnyFSPath", + fs: "FileSystem", + name: str, + callback: "FsspecCallback" = DEFAULT_CALLBACK, +) -> Tuple["str", Dict[str, Any]]: + info = _adapt_info(fs.info(fs_path), fs.scheme) + + if name in info: + assert not info[name].endswith(".dir") + return info[name], info + + if hasattr(fs, name): + func = getattr(fs, name) + return func(fs_path), info + + if name == "md5": + return file_md5(fs_path, fs, callback=callback), info + raise NotImplementedError + + +class LargeFileHashingCallback(TqdmCallback): + """Callback that only shows progress bar if self.size > LARGE_FILE_SIZE.""" + + LARGE_FILE_SIZE = 2**30 + + def __init__(self, *args, **kwargs): + kwargs.setdefault("bytes", True) + super().__init__(*args, **kwargs) + self._logged = False + + # TqdmCallback force renders progress bar on `set_size`. + set_size = FsspecCallback.set_size + + def call(self, hook_name=None, **kwargs): + if self.size and self.size > self.LARGE_FILE_SIZE: + if not self._logged: + desc = self.progress_bar.desc + logger.info( + f"Computing md5 for a large file '{desc}'. " + "This is only done once." + ) + self._logged = True + super().call() + + +def hash_file( + fs_path: "AnyFSPath", + fs: "FileSystem", + name: str, + state: "StateBase" = None, + callback: "FsspecCallback" = None, +) -> Tuple["Meta", "HashInfo"]: + if state: + meta, hash_info = state.get(fs_path, fs) + if hash_info: + return meta, hash_info + + cb = callback or LargeFileHashingCallback(desc=fs_path) + with cb: + hash_value, info = _hash_file(fs_path, fs, name, callback=cb) + hash_info = HashInfo(name, hash_value) + if state: + assert ".dir" not in hash_info.value + state.save(fs_path, fs, hash_info) + + meta = Meta(size=info["size"], isexec=is_exec(info.get("mode", 0))) + return meta, hash_info diff --git a/dvc/objects/state.py b/dvc/objects/state.py index fb16acdeee..de4a55d61a 100644 --- a/dvc/objects/state.py +++ b/dvc/objects/state.py @@ -6,7 +6,7 @@ from dvc.fs.local import LocalFileSystem from dvc.fs.system import inode as get_inode -from dvc.utils import relpath +from dvc.fs.utils import relpath from .hash_info import HashInfo from .utils import get_mtime_and_size diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index d75edb84a9..a410e4b644 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -6,7 +6,6 @@ import math import os import re -import stat import sys import time from typing import Dict, List, Optional, Tuple @@ -15,62 +14,10 @@ logger = logging.getLogger(__name__) -LOCAL_CHUNK_SIZE = 2**20 # 1 MB -LARGE_FILE_SIZE = 2**30 # 1 GB LARGE_DIR_SIZE = 100 TARGET_REGEX = re.compile(r"(?P.*?)(:(?P[^\\/:]*))??$") -def dos2unix(data): - return data.replace(b"\r\n", b"\n") - - -def _fobj_md5(fobj, hash_md5, binary, progress_func=None): - while True: - data = fobj.read(LOCAL_CHUNK_SIZE) - if not data: - break - - if binary: - chunk = data - else: - chunk = dos2unix(data) - - hash_md5.update(chunk) - if progress_func: - progress_func(len(data)) - - -def file_md5(fname, fs): - """get the (md5 hexdigest, md5 digest) of a file""" - from dvc.objects.istextfile import istextfile - from dvc.progress import Tqdm - - hash_md5 = hashlib.md5() - binary = not istextfile(fname, fs=fs) - size = fs.size(fname) or 0 - no_progress_bar = True - if size >= LARGE_FILE_SIZE: - no_progress_bar = False - msg = ( - f"Computing md5 for a large file '{fname}'. " - "This is only done once." - ) - logger.info(msg) - - with Tqdm( - desc=str(fname), - disable=no_progress_bar, - total=size, - bytes=True, - leave=False, - ) as pbar: - with fs.open(fname, "rb") as fobj: - _fobj_md5(fobj, hash_md5, binary, pbar.update) - - return hash_md5.hexdigest() - - def bytes_hash(byts, typ): hasher = getattr(hashlib, typ)() hasher.update(byts) @@ -370,7 +317,6 @@ def resolve_output(inp, out): def resolve_paths(repo, out, always_local=False): from urllib.parse import urlparse - from dvc.fs import system from dvc.fs.local import localfs from ..dvcfile import DVC_FILE_SUFFIX @@ -390,7 +336,7 @@ def resolve_paths(repo, out, always_local=False): if scheme or not localfs.path.isin_or_eq(abspath, repo.root_dir): wdir = os.getcwd() elif contains_symlink_up_to(dirname, repo.root_dir) or ( - os.path.isdir(abspath) and system.is_symlink(abspath) + os.path.isdir(abspath) and localfs.is_symlink(abspath) ): msg = ( "Cannot add files inside symlinked directories to DVC. " @@ -471,10 +417,6 @@ def parse_target( return path or default, name -def is_exec(mode): - return bool(mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)) - - def glob_targets(targets, glob=True, recursive=True): from ..exceptions import DvcException diff --git a/tests/func/objects/test_hash.py b/tests/func/objects/test_hash.py new file mode 100644 index 0000000000..10eae3ff46 --- /dev/null +++ b/tests/func/objects/test_hash.py @@ -0,0 +1,16 @@ +from dvc.fs import LocalFileSystem +from dvc.objects.hash import file_md5 + + +def test_file_md5(tmp_dir): + tmp_dir.gen("foo", "foo content") + + fs = LocalFileSystem() + assert file_md5("foo", fs) == file_md5("foo", fs) + + +def test_file_md5_crlf(tmp_dir): + fs = LocalFileSystem() + tmp_dir.gen("cr", b"a\nb\nc") + tmp_dir.gen("crlf", b"a\r\nb\r\nc") + assert file_md5("cr", fs) == file_md5("crlf", fs) diff --git a/tests/func/test_add.py b/tests/func/test_add.py index f092f91a2b..9b3ee552e6 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -23,6 +23,7 @@ ) from dvc.fs import system from dvc.fs.local import LocalFileSystem +from dvc.objects.hash import file_md5 from dvc.objects.hash_info import HashInfo from dvc.output import ( OutputAlreadyTrackedError, @@ -35,7 +36,7 @@ StagePathNotFoundError, ) from dvc.testing.test_workspace import TestAdd -from dvc.utils import LARGE_DIR_SIZE, file_md5, relpath +from dvc.utils import LARGE_DIR_SIZE, relpath from dvc.utils.fs import path_isin from dvc.utils.serialize import YAMLFileCorruptedError, load_yaml from tests.basic_env import TestDvc @@ -378,7 +379,7 @@ def test_dir(self): def test_should_update_state_entry_for_file_after_add(mocker, dvc, tmp_dir): - file_md5_counter = mocker.spy(dvc_module.data.stage, "file_md5") + file_md5_counter = mocker.spy(dvc_module.objects.hash, "file_md5") tmp_dir.gen("foo", "foo") ret = main(["config", "cache.type", "copy"]) @@ -409,7 +410,7 @@ def test_should_update_state_entry_for_file_after_add(mocker, dvc, tmp_dir): def test_should_update_state_entry_for_directory_after_add( mocker, dvc, tmp_dir ): - file_md5_counter = mocker.spy(dvc_module.data.stage, "file_md5") + file_md5_counter = mocker.spy(dvc_module.objects.hash, "file_md5") tmp_dir.gen({"data/data": "foo", "data/data_sub/sub_data": "foo"}) diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index eea0e549b4..8061405e92 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -149,7 +149,7 @@ def test_warn_on_outdated_stage(tmp_dir, dvc, local_remote, caplog): def test_hash_recalculation(mocker, dvc, tmp_dir, local_remote): tmp_dir.gen({"foo": "foo"}) - test_file_md5 = mocker.spy(dvc_module.data.stage, "file_md5") + test_file_md5 = mocker.spy(dvc_module.objects.hash, "file_md5") ret = main(["config", "cache.type", "hardlink"]) assert ret == 0 ret = main(["add", "foo"]) @@ -211,7 +211,7 @@ def test_verify_hashes( remove("dir") remove(dvc.odb.local.cache_dir) - hash_spy = mocker.spy(dvc_module.data.stage, "file_md5") + hash_spy = mocker.spy(dvc_module.objects.hash, "file_md5") dvc.pull() assert hash_spy.call_count == 0 diff --git a/tests/func/test_install.py b/tests/func/test_install.py index ee6fde2980..76ebca454c 100644 --- a/tests/func/test_install.py +++ b/tests/func/test_install.py @@ -6,7 +6,7 @@ from git import GitCommandError from dvc.exceptions import DvcException -from dvc.utils import file_md5 +from dvc.objects.hash import file_md5 from tests.func.parsing.test_errors import escape_ansi diff --git a/tests/func/test_repro.py b/tests/func/test_repro.py index b783cc25b3..7abed51d72 100644 --- a/tests/func/test_repro.py +++ b/tests/func/test_repro.py @@ -16,10 +16,11 @@ ) from dvc.fs import system from dvc.fs.local import LocalFileSystem +from dvc.objects.hash import file_md5 from dvc.output import Output from dvc.stage import Stage from dvc.stage.exceptions import StageFileDoesNotExistError -from dvc.utils import file_md5, relpath +from dvc.utils import relpath from dvc.utils.fs import remove from dvc.utils.serialize import dump_yaml, load_yaml from tests.basic_env import TestDvc diff --git a/tests/func/test_run_single_stage.py b/tests/func/test_run_single_stage.py index 2c500a1eef..1a4992a27f 100644 --- a/tests/func/test_run_single_stage.py +++ b/tests/func/test_run_single_stage.py @@ -20,6 +20,7 @@ StagePathAsOutputError, ) from dvc.fs import system +from dvc.objects.hash import file_md5 from dvc.output import Output, OutputIsStageFileError from dvc.repo import Repo as DvcRepo from dvc.stage import Stage @@ -30,7 +31,6 @@ StagePathNotFoundError, StagePathOutsideError, ) -from dvc.utils import file_md5 from dvc.utils.serialize import load_yaml from tests.basic_env import TestDvc, TestDvcGit diff --git a/tests/func/test_state.py b/tests/func/test_state.py index 1d0f77649a..f02d70bf78 100644 --- a/tests/func/test_state.py +++ b/tests/func/test_state.py @@ -1,10 +1,10 @@ import os import re +from dvc.objects.hash import file_md5 from dvc.objects.hash_info import HashInfo from dvc.objects.state import State from dvc.repo import Repo -from dvc.utils import file_md5 def test_state(tmp_dir, dvc): diff --git a/tests/func/test_utils.py b/tests/func/test_utils.py index 7910f4fbac..0a1834efb4 100644 --- a/tests/func/test_utils.py +++ b/tests/func/test_utils.py @@ -4,14 +4,6 @@ from dvc import utils from dvc.exceptions import DvcException -from dvc.fs.local import LocalFileSystem - - -def test_file_md5_crlf(tmp_dir): - fs = LocalFileSystem() - tmp_dir.gen("cr", b"a\nb\nc") - tmp_dir.gen("crlf", b"a\r\nb\r\nc") - assert utils.file_md5("cr", fs) == utils.file_md5("crlf", fs) def test_dict_md5(): diff --git a/tests/unit/fs/test_data.py b/tests/unit/fs/test_data.py index f56985af6c..eba315046e 100644 --- a/tests/unit/fs/test_data.py +++ b/tests/unit/fs/test_data.py @@ -232,9 +232,9 @@ def test_get_hash_dir(tmp_dir, dvc, mocker): {"dir": {"foo": "foo", "bar": "bar", "subdir": {"data": "data"}}} ) fs = DataFileSystem(repo=dvc) - get_file_hash_spy = mocker.spy(dvc_module.data.stage, "get_file_hash") + hash_file_spy = mocker.spy(dvc_module.objects.hash, "hash_file") assert fs.info("dir")["md5"] == "8761c4e9acad696bee718615e23e22db.dir" - assert not get_file_hash_spy.called + assert not hash_file_spy.called def test_get_hash_granular(tmp_dir, dvc): diff --git a/tests/unit/fs/test_dvc.py b/tests/unit/fs/test_dvc.py index 9dae47e3aa..d1a622de11 100644 --- a/tests/unit/fs/test_dvc.py +++ b/tests/unit/fs/test_dvc.py @@ -576,8 +576,8 @@ def test_get_hash_mixed_dir(tmp_dir, scm, dvc): def test_get_hash_dirty_file(tmp_dir, dvc): from dvc.data import check - from dvc.data.stage import get_file_hash from dvc.objects.errors import ObjectFormatError + from dvc.objects.hash import hash_file tmp_dir.dvc_gen("file", "file") file_hash_info = HashInfo("md5", "8c7dd922ad47494fc02c388e12c00eac") @@ -588,7 +588,7 @@ def test_get_hash_dirty_file(tmp_dir, dvc): clean_staging() # file is modified in workspace - # get_file_hash(file) should return workspace hash, not DVC cached hash + # hash_file(file) should return workspace hash, not DVC cached hash fs = DvcFileSystem(repo=dvc) assert fs.info("file").get("md5") is None staging, _, obj = stage(dvc.odb.local, "file", fs, "md5") @@ -601,9 +601,9 @@ def test_get_hash_dirty_file(tmp_dir, dvc): with pytest.raises(ObjectFormatError): check(staging, obj) - # get_file_hash(file) should return DVC cached hash + # hash_file(file) should return DVC cached hash assert fs.info("file")["md5"] == file_hash_info.value - _, hash_info = get_file_hash("file", fs, "md5", state=dvc.state) + _, hash_info = hash_file("file", fs, "md5", state=dvc.state) assert hash_info == file_hash_info # tmp_dir/file can be staged even though it is missing in workspace since diff --git a/tests/unit/utils/test_stream.py b/tests/unit/utils/test_stream.py index cb9877f196..edbb15774b 100644 --- a/tests/unit/utils/test_stream.py +++ b/tests/unit/utils/test_stream.py @@ -1,9 +1,9 @@ import pytest +from dvc.data.stream import HashedStreamReader from dvc.fs.local import LocalFileSystem +from dvc.objects.hash import file_md5 from dvc.objects.istextfile import DEFAULT_CHUNK_SIZE, istextfile -from dvc.utils import file_md5 -from dvc.utils.stream import HashedStreamReader def test_hashed_stream_reader(tmp_dir): diff --git a/tests/unit/utils/test_utils.py b/tests/unit/utils/test_utils.py index a4800b461b..5971b5fff3 100644 --- a/tests/unit/utils/test_utils.py +++ b/tests/unit/utils/test_utils.py @@ -3,10 +3,8 @@ import pytest -from dvc.fs.local import LocalFileSystem from dvc.utils import ( dict_sha256, - file_md5, fix_env, parse_target, relpath, @@ -83,13 +81,6 @@ def test_fix_env_pyenv(path, orig): assert fix_env(env)["PATH"] == orig -def test_file_md5(tmp_dir): - tmp_dir.gen("foo", "foo content") - - fs = LocalFileSystem() - assert file_md5("foo", fs) == file_md5("foo", fs) - - def test_tmp_fname(): file_path = os.path.join("path", "to", "file")