-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WIP] cache/remote: drop dos2unix MD5 by default #5337
Changes from all commits
ca81c8f
b01e554
afe7787
3710c1a
d645633
b5aade4
c044932
d465bdf
beef3e9
fa76a7f
7df05b1
465d48a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
import logging | ||
from typing import TYPE_CHECKING, Optional, Union | ||
|
||
from voluptuous import MultipleInvalid | ||
|
||
from dvc.exceptions import DvcException | ||
from dvc.parsing.versions import SCHEMA_KWD | ||
from dvc.path_info import PathInfo | ||
|
||
from .versions import ODB_VERSION | ||
|
||
if TYPE_CHECKING: | ||
from dvc.tree.base import BaseTree | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class ODBConfigFormatError(DvcException):
    """Raised when an ODB config file does not pass schema validation."""
|
||
|
||
def get_odb_schema(d):
    """Return the compiled validation schema for the ODB config ``d``.

    The config version is detected with ``ODB_VERSION.from_dict`` and used
    to pick the matching compiled schema. Only a V2 schema is registered.

    Args:
        d: Parsed ODB config contents (normally a mapping).

    Returns:
        The compiled schema callable for the detected version.

    Raises:
        ODBConfigFormatError: If the detected version has no registered
            schema. ``from_dict`` resolves empty, non-mapping, schema-less
            and "1.0" configs to V1, which previously surfaced as a bare
            ``KeyError`` from the lookup below.
    """
    # Imported lazily, as in the original (presumably to avoid an import
    # cycle with dvc.schema -- TODO confirm).
    from dvc.schema import COMPILED_ODB_CONFIG_V2_SCHEMA

    schema = {ODB_VERSION.V2: COMPILED_ODB_CONFIG_V2_SCHEMA}
    version = ODB_VERSION.from_dict(d)
    if version not in schema:
        raise ODBConfigFormatError(
            f"no config schema available for ODB version '{version.value}'"
        )
    return schema[version]
|
||
|
||
class BaseODB:
    """Object database (ODB) wrapper that tracks the ODB config file.

    The config is loaded from ``CONFIG_FILE`` under ``path_info`` when the
    instance is created. ``migrate_config`` upgrades a V1 config to the
    latest version and writes it back through the tree.
    """

    CONFIG_FILE = "dvc.odb.yaml"

    def __init__(
        self, tree: "BaseTree", path: Optional[Union[str, "PathInfo"]] = None
    ):
        """Bind the ODB to ``tree`` and load its config.

        Args:
            tree: Tree used for all existence checks, reads and writes.
            path: Optional ODB location. A ``str`` is wrapped in
                ``PathInfo``; any other truthy value is used as-is. When
                falsy, ``tree.path_info`` is used instead.
        """
        self.tree = tree
        if path:
            if isinstance(path, str):
                self.path_info = PathInfo(path)
            else:
                self.path_info = path
        else:
            self.path_info = tree.path_info
        self.config = self._load_config()

    @property
    def config_path(self):
        """Path of the config file, or ``None`` when ``path_info`` is falsy."""
        if self.path_info:
            return self.path_info / self.CONFIG_FILE
        return None

    def _load_config(self):
        """Load the ODB config through the tree.

        Returns:
            * ``latest_version_info`` when there is no config path;
            * the parsed file contents when the file exists and validates;
            * ``{}`` when the file is missing or fails validation.
        """
        from dvc.utils.serialize import load_yaml

        if not self.config_path:
            return self.latest_version_info

        if self.tree.exists(self.config_path):
            data = load_yaml(self.config_path, tree=self.tree)
            try:
                self._validate_version(data)
            except (MultipleInvalid, ODBConfigFormatError) as exc:
                # _validate_version converts MultipleInvalid into
                # ODBConfigFormatError, so catching only MultipleInvalid
                # never triggered the intended fallback to an empty config.
                logger.debug(
                    "Ignoring invalid ODB config '%s': %s",
                    self.config_path,
                    exc,
                )
            else:
                return data
        return {}

    @classmethod
    def _validate_version(cls, d):
        """Validate ``d`` against the schema matching its version.

        Returns:
            The value returned by the schema for ``d``.

        Raises:
            ODBConfigFormatError: If schema validation fails. The original
                ``MultipleInvalid`` is kept as ``__cause__``.
        """
        schema = get_odb_schema(d)
        try:
            return schema(d)
        except MultipleInvalid as exc:
            raise ODBConfigFormatError(
                f"'{cls.CONFIG_FILE}' format error: {exc}"
            ) from exc

    @property
    def version(self):
        """The ``ODB_VERSION`` detected from the loaded config."""
        return ODB_VERSION.from_dict(self.config)

    @property
    def latest_version_info(self):
        """Config fragment that declares the latest (V2) schema version."""
        version = ODB_VERSION.V2.value  # pylint:disable=no-member
        return {SCHEMA_KWD: version}

    def _dump_config(self):
        """Merge ``self.config`` into the config file; no-op without a path."""
        from dvc.utils.serialize import modify_yaml

        if not self.config_path:
            return

        logger.debug("Writing ODB config '%s'", self.config_path)
        # Create the parent directory first if the tree does not have it.
        if not self.tree.exists(self.config_path.parent):
            self.tree.makedirs(self.config_path.parent)
        with modify_yaml(self.config_path, tree=self.tree) as data:
            data.update(self.config)

    def migrate_config(self):
        """Upgrade a V1 config to the latest version and write it out.

        Nothing happens unless the current version is V1 and the tree's
        ``enable_dos2unix`` is falsy.
        """
        if self.version == ODB_VERSION.V1 and not self.tree.enable_dos2unix:
            logger.debug("Migrating ODB config '%s' to v2", self.config_path)
            self.config.update(self.latest_version_info)
            self._dump_config()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import enum | ||
from collections.abc import Mapping | ||
|
||
from voluptuous import validators | ||
|
||
from dvc.parsing.versions import SCHEMA_KWD | ||
|
||
|
||
def odb_version_schema(value):
    """Validate ``value`` as a supported ODB schema version.

    Only the V2 version string is accepted. ``value`` is run through a
    voluptuous ``Any`` validator built from the accepted versions, with a
    message naming the rejected value and the accepted list.

    Returns:
        Whatever the validator returns for ``value``.
    """
    supported = [ODB_VERSION.V2.value]  # pylint: disable=no-member
    template = "invalid schema version {}, expected one of {}"
    check = validators.Any(*supported, msg=template.format(value, supported))
    return check(value)
|
||
|
||
class VersionEnum(str, enum.Enum):
    """String-valued enum base class for version identifiers."""

    @classmethod
    def all_versions(cls):
        """Return a list with the value of every member of the enum."""
        values = []
        for member in cls:
            values.append(member.value)
        return values
|
||
|
||
class ODB_VERSION(VersionEnum):
    """Known ODB config schema versions."""

    V1 = "1.0"  # DVC <2.0 (dos2unix MD5)
    V2 = "2.0"  # DVC 2.x (standard MD5)

    @classmethod
    def from_dict(cls, data):
        """Detect the ODB version declared by the config ``data``.

        Resolution rules:
          * empty or non-mapping ``data`` is the oldest version (V1);
          * a missing or falsy schema identifier is also V1;
          * a schema identifier naming a supported version selects it;
          * any other schema identifier falls back to the latest (V2).
        """
        if not data or not isinstance(data, Mapping):
            return cls.V1

        declared = data.get(SCHEMA_KWD)
        if not declared:
            # No identifier at all: treated as a V1 config.
            return cls.V1

        if declared in cls.all_versions():
            return cls(declared)
        # Unrecognised identifier: assume the latest version.
        return cls.V2
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -69,10 +69,12 @@ class Remote: | |
|
||
def __init__(self, tree): | ||
from dvc.cache import get_cloud_cache | ||
from dvc.odb.base import BaseODB | ||
|
||
self.tree = tree | ||
self.repo = tree.repo | ||
self.cache = get_cloud_cache(self.tree) | ||
self.odb = BaseODB(self.tree) | ||
|
||
config = tree.config | ||
url = config.get("url") | ||
|
@@ -450,6 +452,7 @@ def create_taskset(amount): | |
|
||
@index_locked | ||
def push(self, cache, named_cache, jobs=None, show_checksums=False): | ||
self.odb.migrate_config() | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updating/writing the config should really be done as needed after an initial
||
ret = self._process( | ||
cache, | ||
named_cache, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -90,6 +90,10 @@ def hash_jobs(self): | |
or self.HASH_JOBS | ||
) | ||
|
||
@cached_property
def enable_dos2unix(self):
    """Return the repo's ``core.dos2unix`` config value.

    When ``self.repo`` is falsy, that falsy value is returned as-is and the
    config is not consulted.
    """
    repo = self.repo
    if not repo:
        return repo
    return repo.config["core"].get("dos2unix")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this really belongs in
||
|
||
@classmethod | ||
def get_missing_deps(cls): | ||
import importlib | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As it is now this is really more of an
ODBConfig
class, but given that nothing else will probably get refactored into ODB for 2.0, I did not spend too much time thinking about how everything should be organized in the long run.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@pmrowla, is this later going to evolve and be used in chunking?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@skshetry yes, that's the plan for now at least
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe then just
ObjectDB
?