From bc338c3695b9056ff0f5c6d338f2601e0fc8aa05 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Mon, 29 May 2023 16:16:31 +0900 Subject: [PATCH] output: deprecate dos2unix md5 hashing --- dvc/config_schema.py | 1 + dvc/output.py | 13 +++++++++++++ dvc/repo/imports.py | 3 ++- dvc/repo/init.py | 5 +++-- tests/func/test_add.py | 22 ++++++++++++++++++++++ tests/func/test_init.py | 5 +++++ 6 files changed, 46 insertions(+), 3 deletions(-) diff --git a/dvc/config_schema.py b/dvc/config_schema.py index 2e36e90197..43221df09e 100644 --- a/dvc/config_schema.py +++ b/dvc/config_schema.py @@ -152,6 +152,7 @@ def __call__(self, data): Optional("check_update", default=True): Bool, "site_cache_dir": str, "machine": Lower, + Optional("legacy_md5", default=True): Bool, }, "cache": { "local": str, diff --git a/dvc/output.py b/dvc/output.py index 10f818fa91..4698e3d1f6 100644 --- a/dvc/output.py +++ b/dvc/output.py @@ -515,6 +515,12 @@ def cache_path(self): self.cache.oid_to_path(self.hash_info.value) ) + @cached_property + def _is_text(self) -> Optional[bool]: + if self.repo.config["core"].get("legacy_md5", True): + return None + return False + def get_hash(self): _, hash_info = self._get_hash_meta() return hash_info @@ -531,6 +537,7 @@ def _get_hash_meta(self): self.hash_name, ignore=self.dvcignore, dry_run=not self.use_cache, + text=self._is_text, ) return meta, obj.hash_info @@ -671,6 +678,7 @@ def save(self) -> None: self.fs, self.hash_name, ignore=self.dvcignore, + text=self._is_text, ) else: _, self.meta, self.obj = build( @@ -680,6 +688,7 @@ def save(self) -> None: self.hash_name, ignore=self.dvcignore, dry_run=True, + text=self._is_text, ) if not self.IS_DEPENDENCY: logger.debug("Output '%s' doesn't use cache. Skipping saving.", self) @@ -727,6 +736,7 @@ def commit(self, filter_info=None, relink=True) -> None: self.fs, self.hash_name, ignore=self.dvcignore, + text=self._is_text, ) otransfer( staging, @@ -758,6 +768,7 @@ def _commit_granular_dir(self, filter_info, hardlink) -> Optional["HashFile"]: self.fs, self.hash_name, ignore=self.dvcignore, + text=self._is_text, ) assert isinstance(obj, Tree) save_obj = obj.filter(prefix) @@ -973,6 +984,7 @@ def transfer( "md5", upload=upload, no_progress_bar=no_progress_bar, + text=self._is_text, ) otransfer( staging, @@ -1305,6 +1317,7 @@ def add( # noqa: C901 self.hash_name, ignore=self.dvcignore, dry_run=not self.use_cache, + text=self._is_text, ) except FileNotFoundError as exc: if self.fs_path == path: diff --git a/dvc/repo/imports.py b/dvc/repo/imports.py index d6f27a9051..7a39710d4d 100644 --- a/dvc/repo/imports.py +++ b/dvc/repo/imports.py @@ -101,6 +101,7 @@ def save_imports( logger.warning(str(DataSourceChanged(f"{dep.stage} ({dep})"))) data_view = unfetched.data["repo"] + is_text = None if repo.config["core"].get("legacy_md5", True) else False if len(data_view): cache = repo.cache.local if not cache.fs.exists(cache.path): @@ -111,7 +112,7 @@ def save_imports( unit="files", ) as cb: checkout(data_view, tmpdir, cache.fs, callback=cb, storage="data") - md5(data_view) + md5(data_view, text=is_text) save(data_view, odb=cache, hardlink=True) downloaded.update( diff --git a/dvc/repo/init.py b/dvc/repo/init.py index 4a68a2bb2c..37bc97f401 100644 --- a/dvc/repo/init.py +++ b/dvc/repo/init.py @@ -67,8 +67,9 @@ def init(root_dir=os.curdir, no_scm=False, force=False, subdir=False): # noqa: config = Config.init(dvc_dir) - if no_scm: - with config.edit() as conf: + with config.edit() as conf: + conf["core"]["legacy_md5"] = False + if no_scm: conf["core"]["no_scm"] = True dvcignore = init_dvcignore(root_dir) diff --git a/tests/func/test_add.py b/tests/func/test_add.py index 48b2555529..888930ca58 100644 --- a/tests/func/test_add.py +++ b/tests/func/test_add.py @@ -8,6 +8,7 @@ import pytest +import dvc.output as output_module import dvc_data from dvc.cachemgr import CacheManager from dvc.cli import main @@ -1113,3 +1114,24 @@ def test_add_updates_to_cloud_versioning_dir(tmp_dir, dvc): } ] } + + +@pytest.mark.parametrize( + "legacy_md5,expected_hash", + [ + (False, "3dbec9c1b92200eb56349835275e00b9"), + (True, "f47c75614087a8dd938ba4acff252494"), + ], +) +def test_add_legacy_md5(tmp_dir, dvc, mocker, legacy_md5, expected_hash): + with dvc.config.edit() as conf: + conf["core"]["legacy_md5"] = legacy_md5 + expected_text = None if legacy_md5 else False + tmp_dir.gen("foo", "foo\r\nbar\r\n") + + build_spy = mocker.spy(output_module, "build") + (stage,) = dvc.add("foo") + build_spy.assert_called() + for _args, kwargs in build_spy.call_args_list: + assert kwargs["text"] is expected_text + assert stage.outs[0].hash_info == HashInfo("md5", expected_hash) diff --git a/tests/func/test_init.py b/tests/func/test_init.py index 7944d5b1a6..9f301482ab 100644 --- a/tests/func/test_init.py +++ b/tests/func/test_init.py @@ -120,3 +120,8 @@ def test_init_when_ignored_by_git(tmp_dir, scm, caplog): ) in caplog.text ) + + +def test_init_legacy_md5(scm): + with DvcRepo.init() as repo: + assert repo.config["core"].get("legacy_md5") is False