Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] cache/remote: drop dos2unix MD5 by default #5337

Closed
wants to merge 12 commits into from
7 changes: 7 additions & 0 deletions dvc/cache/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,22 @@ class CloudCache:
CACHE_MODE: Optional[int] = None

def __init__(self, tree):
from dvc.odb.base import BaseODB

self.tree = tree
self.repo = tree.repo
self.odb = BaseODB(self.tree, path=self.cache_dir)

self.cache_types = tree.config.get("type") or copy(
self.DEFAULT_CACHE_TYPES
)
self.cache_type_confirmed = False
self._dir_info = {}

@property
def cache_dir(self):
return None

def get_dir_cache(self, hash_info):
assert hash_info

Expand Down
1 change: 1 addition & 0 deletions dvc/config_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ class RelPath(str):
Optional("autostage", default=False): Bool,
Optional("experiments"): Bool, # obsoleted
Optional("check_update", default=True): Bool,
Optional("dos2unix", default=False): Bool,
},
"cache": {
"local": str,
Expand Down
12 changes: 12 additions & 0 deletions dvc/hash_info.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import enum
from collections import OrderedDict
from dataclasses import dataclass, field
from typing import Dict, Optional
Expand All @@ -7,6 +8,17 @@
DirInfo = Dict[str, str]


class HashType(str, enum.Enum):

ETAG = "etag"
HDFS_CHECKSUM = "checksum"
MD5 = "md5"

@classmethod
def all_types(cls):
return [typ.value for typ in cls]


@dataclass
class HashInfo:
PARAM_SIZE = "size"
Expand Down
Empty file added dvc/odb/__init__.py
Empty file.
103 changes: 103 additions & 0 deletions dvc/odb/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import logging
from typing import TYPE_CHECKING, Optional, Union

from voluptuous import MultipleInvalid

from dvc.exceptions import DvcException
from dvc.parsing.versions import SCHEMA_KWD
from dvc.path_info import PathInfo

from .versions import ODB_VERSION

if TYPE_CHECKING:
from dvc.tree.base import BaseTree

logger = logging.getLogger(__name__)


class ODBConfigFormatError(DvcException):
pass


def get_odb_schema(d):
from dvc.schema import COMPILED_ODB_CONFIG_V2_SCHEMA

schema = {ODB_VERSION.V2: COMPILED_ODB_CONFIG_V2_SCHEMA}
version = ODB_VERSION.from_dict(d)
return schema[version]


class BaseODB:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As it is now this is really more of an ODBConfig class, but given that nothing else will probably get refactored into ODB for 2.0 I did not spend too much time thinking about how everything should be organized in the long run

Copy link
Member

@skshetry skshetry Jan 26, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pmrowla, is this later going to evolve and be used in chunking?

Copy link
Contributor Author

@pmrowla pmrowla Jan 26, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@skshetry yes, that's the plan for now at least

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe then just ObjectDB?


CONFIG_FILE = "dvc.odb.yaml"

def __init__(
self, tree: "BaseTree", path: Optional[Union[str, "PathInfo"]] = None
):
self.tree = tree
if path:
if isinstance(path, str):
self.path_info = PathInfo(path)
else:
self.path_info = path
else:
self.path_info = tree.path_info
self.config = self._load_config()

@property
def config_path(self):
if self.path_info:
return self.path_info / self.CONFIG_FILE
return None

def _load_config(self):
from dvc.utils.serialize import load_yaml

if not self.config_path:
return self.latest_version_info

if self.tree.exists(self.config_path):
data = load_yaml(self.config_path, tree=self.tree)
try:
self._validate_version(data)
return data
except MultipleInvalid:
pass
return {}

@classmethod
def _validate_version(cls, d):
schema = get_odb_schema(d)
try:
return schema(d)
except MultipleInvalid as exc:
raise ODBConfigFormatError(
f"'{cls.CONFIG_FILE}' format error: {exc}"
)

@property
def version(self):
return ODB_VERSION.from_dict(self.config)

@property
def latest_version_info(self):
version = ODB_VERSION.V2.value # pylint:disable=no-member
return {SCHEMA_KWD: version}

def _dump_config(self):
from dvc.utils.serialize import modify_yaml

if not self.config_path:
return

logger.debug("Writing ODB config '%s'", self.config_path)
if not self.tree.exists(self.config_path.parent):
self.tree.makedirs(self.config_path.parent)
with modify_yaml(self.config_path, tree=self.tree) as data:
data.update(self.config)

def migrate_config(self):
if self.version == ODB_VERSION.V1 and not self.tree.enable_dos2unix:
logger.debug("Migrating ODB config '%s' to v2", self.config_path)
self.config.update(self.latest_version_info)
self._dump_config()
40 changes: 40 additions & 0 deletions dvc/odb/versions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import enum
from collections.abc import Mapping

from voluptuous import validators

from dvc.parsing.versions import SCHEMA_KWD


def odb_version_schema(value):
expected = [ODB_VERSION.V2.value] # pylint: disable=no-member
msg = "invalid schema version {}, expected one of {}".format(
value, expected
)
return validators.Any(*expected, msg=msg)(value)


class VersionEnum(str, enum.Enum):
@classmethod
def all_versions(cls):
return [v.value for v in cls]


class ODB_VERSION(VersionEnum):
V1 = "1.0" # DVC <2.0 (dos2unix MD5)
V2 = "2.0" # DVC 2.x (standard MD5)

@classmethod
def from_dict(cls, data):
# 1) if it's empty or or is not a dict, use the oldest one (V1).
# 2) use the `schema` identifier if it exists and is a supported
# version
# 3) if it's not in any of the supported version, use the latest one
# 4) if there's no identifier, it's a V1
if not data or not isinstance(data, Mapping):
return cls(cls.V1)

version = data.get(SCHEMA_KWD)
if version:
return cls(version if version in cls.all_versions() else cls.V2)
return cls(cls.V1)
13 changes: 2 additions & 11 deletions dvc/output/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from funcy import collecting, project
from voluptuous import And, Any, Coerce, Length, Lower, Required, SetTo

from dvc.hash_info import HashInfo
from dvc.hash_info import HashInfo, HashType
from dvc.output.base import BaseOutput
from dvc.output.gs import GSOutput
from dvc.output.hdfs import HDFSOutput
Expand All @@ -15,10 +15,6 @@
from dvc.scheme import Schemes

from ..tree import get_cloud_tree
from ..tree.hdfs import HDFSTree
from ..tree.local import LocalTree
from ..tree.s3 import S3Tree
from ..tree.webhdfs import WebHDFSTree

OUTS = [
HDFSOutput,
Expand Down Expand Up @@ -52,12 +48,7 @@
#
# so when a few types of outputs share the same name, we only need
# specify it once.
CHECKSUMS_SCHEMA = {
LocalTree.PARAM_CHECKSUM: CHECKSUM_SCHEMA,
S3Tree.PARAM_CHECKSUM: CHECKSUM_SCHEMA,
HDFSTree.PARAM_CHECKSUM: CHECKSUM_SCHEMA,
WebHDFSTree.PARAM_CHECKSUM: CHECKSUM_SCHEMA,
}
CHECKSUMS_SCHEMA = {typ: CHECKSUM_SCHEMA for typ in HashType.all_types()}

SCHEMA = CHECKSUMS_SCHEMA.copy()
SCHEMA[Required(BaseOutput.PARAM_PATH)] = str
Expand Down
3 changes: 3 additions & 0 deletions dvc/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,12 @@ class Remote:

def __init__(self, tree):
from dvc.cache import get_cloud_cache
from dvc.odb.base import BaseODB

self.tree = tree
self.repo = tree.repo
self.cache = get_cloud_cache(self.tree)
self.odb = BaseODB(self.tree)

config = tree.config
url = config.get("url")
Expand Down Expand Up @@ -450,6 +452,7 @@ def create_taskset(amount):

@index_locked
def push(self, cache, named_cache, jobs=None, show_checksums=False):
self.odb.migrate_config()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updating/writing the config should really be done as needed after an initial odb.write/save/etc operation, but since we don't do anything else with ODB at the moment writing the config file to a remote on push should be sufficient for 2.0

ret = self._process(
cache,
named_cache,
Expand Down
6 changes: 6 additions & 0 deletions dvc/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from dvc import dependency, output
from dvc.hash_info import HashInfo
from dvc.odb.versions import odb_version_schema
from dvc.output import CHECKSUMS_SCHEMA, BaseOutput
from dvc.parsing import DO_KWD, FOREACH_KWD, VARS_KWD
from dvc.parsing.versions import SCHEMA_KWD, lockfile_version_schema
Expand Down Expand Up @@ -111,3 +112,8 @@
COMPILED_LOCK_FILE_STAGE_SCHEMA = Schema(LOCK_FILE_STAGE_SCHEMA)
COMPILED_LOCKFILE_V1_SCHEMA = Schema(LOCKFILE_V1_SCHEMA)
COMPILED_LOCKFILE_V2_SCHEMA = Schema(LOCKFILE_V2_SCHEMA)

ODB_VERSION_SCHEMA = {
Required(SCHEMA_KWD): odb_version_schema,
}
COMPILED_ODB_CONFIG_V2_SCHEMA = Schema(ODB_VERSION_SCHEMA)
4 changes: 2 additions & 2 deletions dvc/tree/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from funcy import cached_property, wrap_prop

from dvc.hash_info import HashInfo
from dvc.hash_info import HashInfo, HashType
from dvc.path_info import CloudURLInfo
from dvc.progress import Tqdm
from dvc.scheme import Schemes
Expand All @@ -22,7 +22,7 @@ class AzureTree(BaseTree):
"azure-storage-blob": "azure.storage.blob",
"knack": "knack",
}
PARAM_CHECKSUM = "etag"
PARAM_CHECKSUM = HashType.ETAG.value
COPY_POLL_SECONDS = 5
LIST_OBJECT_PAGE_SIZE = 5000

Expand Down
4 changes: 4 additions & 0 deletions dvc/tree/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ def hash_jobs(self):
or self.HASH_JOBS
)

@cached_property
def enable_dos2unix(self):
return self.repo and self.repo.config["core"].get("dos2unix")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this really belongs in BaseTree long term, but currently we compute MD5's from both RepoTree and LocalTree


@classmethod
def get_missing_deps(cls):
import importlib
Expand Down
4 changes: 2 additions & 2 deletions dvc/tree/gs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from funcy import cached_property, wrap_prop

from dvc.exceptions import DvcException
from dvc.hash_info import HashInfo
from dvc.hash_info import HashInfo, HashType
from dvc.path_info import CloudURLInfo
from dvc.progress import Tqdm
from dvc.scheme import Schemes
Expand Down Expand Up @@ -70,7 +70,7 @@ class GSTree(BaseTree):
scheme = Schemes.GS
PATH_CLS = CloudURLInfo
REQUIRES = {"google-cloud-storage": "google.cloud.storage"}
PARAM_CHECKSUM = "md5"
PARAM_CHECKSUM = HashType.MD5.value

def __init__(self, repo, config):
super().__init__(repo, config)
Expand Down
4 changes: 2 additions & 2 deletions dvc/tree/hdfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from contextlib import closing, contextmanager
from urllib.parse import urlparse

from dvc.hash_info import HashInfo
from dvc.hash_info import HashInfo, HashType
from dvc.progress import Tqdm
from dvc.scheme import Schemes
from dvc.utils import fix_env, tmp_fname
Expand Down Expand Up @@ -62,7 +62,7 @@ class HDFSTree(BaseTree):
scheme = Schemes.HDFS
REQUIRES = {"pyarrow": "pyarrow"}
REGEX = r"^hdfs://((?P<user>.*)@)?.*$"
PARAM_CHECKSUM = "checksum"
PARAM_CHECKSUM = HashType.HDFS_CHECKSUM.value
TRAVERSE_PREFIX_LEN = 2

def __init__(self, repo, config):
Expand Down
4 changes: 2 additions & 2 deletions dvc/tree/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import dvc.prompt as prompt
from dvc.exceptions import DvcException, HTTPError
from dvc.hash_info import HashInfo
from dvc.hash_info import HashInfo, HashType
from dvc.path_info import HTTPURLInfo
from dvc.progress import Tqdm
from dvc.scheme import Schemes
Expand All @@ -28,7 +28,7 @@ def ask_password(host, user):
class HTTPTree(BaseTree): # pylint:disable=abstract-method
scheme = Schemes.HTTP
PATH_CLS = HTTPURLInfo
PARAM_CHECKSUM = "etag"
PARAM_CHECKSUM = HashType.ETAG.value
CAN_TRAVERSE = False

SESSION_RETRIES = 5
Expand Down
9 changes: 6 additions & 3 deletions dvc/tree/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from funcy import cached_property

from dvc.hash_info import HashInfo
from dvc.hash_info import HashInfo, HashType
from dvc.path_info import PathInfo
from dvc.scheme import Schemes
from dvc.system import System
Expand All @@ -20,7 +20,7 @@
class LocalTree(BaseTree):
scheme = Schemes.LOCAL
PATH_CLS = PathInfo
PARAM_CHECKSUM = "md5"
PARAM_CHECKSUM = HashType.MD5.value
PARAM_PATH = "path"
TRAVERSE_PREFIX_LEN = 2

Expand Down Expand Up @@ -231,7 +231,10 @@ def chmod(self, path_info, mode):
raise

def get_file_hash(self, path_info):
hash_info = HashInfo(self.PARAM_CHECKSUM, file_md5(path_info)[0],)
hash_info = HashInfo(
self.PARAM_CHECKSUM,
file_md5(path_info, enable_dos2unix=self.enable_dos2unix)[0],
)

if hash_info:
hash_info.size = os.path.getsize(path_info)
Expand Down
Loading