From 2af9b303e0b216a4f268990860593d41cd8fc52e Mon Sep 17 00:00:00 2001 From: "brynn.zalmanek@pnnl.gov" Date: Wed, 8 Nov 2023 11:43:27 -0800 Subject: [PATCH 1/4] add checksum timestamp changes --- nmdc_runtime/api/core/util.py | 5 ++++- nmdc_runtime/api/endpoints/util.py | 1 + nmdc_runtime/util.py | 4 ++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/nmdc_runtime/api/core/util.py b/nmdc_runtime/api/core/util.py index 48d15ed1..ff3bcb4a 100644 --- a/nmdc_runtime/api/core/util.py +++ b/nmdc_runtime/api/core/util.py @@ -28,10 +28,13 @@ def hash_from_str(s: str, algo="sha256") -> str: return getattr(hashlib, algo)(s.encode("utf-8")).hexdigest() -def sha256hash_from_file(file_path: str): +def sha256hash_from_file(file_path: str, timestamp: str): # https://stackoverflow.com/a/55542529 h = hashlib.sha256() + timestamp_bytes = timestamp.encode('utf-8') + h.update(timestamp_bytes) + with open(file_path, "rb") as file: while True: # Reading is buffered, so we can read smaller chunks. diff --git a/nmdc_runtime/api/endpoints/util.py b/nmdc_runtime/api/endpoints/util.py index fd473682..35c63a5a 100644 --- a/nmdc_runtime/api/endpoints/util.py +++ b/nmdc_runtime/api/endpoints/util.py @@ -442,6 +442,7 @@ def persist_content_and_get_drs_object( ), "access_methods": [{"access_id": drs_id}], }, + timestamp= datetime.now(tz=ZoneInfo('America/Los_Angeles')).isoformat(timespec='minutes') ) ) self_uri = f"drs://{HOSTNAME_EXTERNAL}/{drs_id}" diff --git a/nmdc_runtime/util.py b/nmdc_runtime/util.py index 23e94dd1..475a98d4 100644 --- a/nmdc_runtime/util.py +++ b/nmdc_runtime/util.py @@ -82,7 +82,7 @@ def put_object(filepath, url, mime_type=None): return requests.put(url, data=f, headers={"Content-Type": mime_type}) -def drs_metadata_for(filepath, base=None): +def drs_metadata_for(filepath, base=None, timestamp=None): """given file path, get drs metadata required: size, created_time, and at least one checksum. @@ -96,7 +96,7 @@ def drs_metadata_for(filepath, base=None): ) if "checksums" not in base: base["checksums"] = [ - {"type": "sha256", "checksum": sha256hash_from_file(filepath)} + {"type": "sha256", "checksum": sha256hash_from_file(filepath, timestamp)} ] if "mime_type" not in base: base["mime_type"] = mimetypes.guess_type(filepath)[0] From 3232473faa3f2ac7c0c6a0766c60bf55156489ce Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Mon, 13 Nov 2023 10:43:03 -0500 Subject: [PATCH 2/4] style: black reformat --- nmdc_runtime/api/core/util.py | 2 +- nmdc_runtime/api/endpoints/util.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/nmdc_runtime/api/core/util.py b/nmdc_runtime/api/core/util.py index ff3bcb4a..ad97471f 100644 --- a/nmdc_runtime/api/core/util.py +++ b/nmdc_runtime/api/core/util.py @@ -32,7 +32,7 @@ def sha256hash_from_file(file_path: str, timestamp: str): # https://stackoverflow.com/a/55542529 h = hashlib.sha256() - timestamp_bytes = timestamp.encode('utf-8') + timestamp_bytes = timestamp.encode("utf-8") h.update(timestamp_bytes) with open(file_path, "rb") as file: diff --git a/nmdc_runtime/api/endpoints/util.py b/nmdc_runtime/api/endpoints/util.py index 35c63a5a..3c7a7da9 100644 --- a/nmdc_runtime/api/endpoints/util.py +++ b/nmdc_runtime/api/endpoints/util.py @@ -442,7 +442,9 @@ def persist_content_and_get_drs_object( ), "access_methods": [{"access_id": drs_id}], }, - timestamp= datetime.now(tz=ZoneInfo('America/Los_Angeles')).isoformat(timespec='minutes') + timestamp=datetime.now(tz=ZoneInfo("America/Los_Angeles")).isoformat( + timespec="minutes" + ), ) ) self_uri = f"drs://{HOSTNAME_EXTERNAL}/{drs_id}" From 51ff902aa65738372a82d46068a4a7ac0f11436d Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Mon, 13 Nov 2023 10:45:39 -0500 Subject: [PATCH 3/4] style: DRY --- nmdc_runtime/api/endpoints/util.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nmdc_runtime/api/endpoints/util.py b/nmdc_runtime/api/endpoints/util.py index 3c7a7da9..f8279efb 100644 --- a/nmdc_runtime/api/endpoints/util.py +++ b/nmdc_runtime/api/endpoints/util.py @@ -431,6 +431,9 @@ def persist_content_and_get_drs_object( filepath = str(Path(save_dir).joinpath(filename)) with open(filepath, "w") as f: f.write(content) + now_to_the_minute = datetime.now(tz=ZoneInfo("America/Los_Angeles")).isoformat( + timespec="minutes" + ) object_in = DrsObjectIn( **drs_metadata_for( filepath, @@ -438,13 +441,11 @@ def persist_content_and_get_drs_object( "description": ( description + f" (created by/for {username}" - + f" at {datetime.now(tz=ZoneInfo('America/Los_Angeles')).isoformat(timespec='minutes')})" + + f" at {now_to_the_minute})" ), "access_methods": [{"access_id": drs_id}], }, - timestamp=datetime.now(tz=ZoneInfo("America/Los_Angeles")).isoformat( - timespec="minutes" - ), + timestamp=now_to_the_minute, ) ) self_uri = f"drs://{HOSTNAME_EXTERNAL}/{drs_id}" From fac1fc8632de1382b331919e62ad6f9be48bcbdd Mon Sep 17 00:00:00 2001 From: Donny Winston Date: Mon, 13 Nov 2023 11:01:38 -0500 Subject: [PATCH 4/4] add unit test --- tests/unit/core_util.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 tests/unit/core_util.py diff --git a/tests/unit/core_util.py b/tests/unit/core_util.py new file mode 100644 index 00000000..0fbd2873 --- /dev/null +++ b/tests/unit/core_util.py @@ -0,0 +1,21 @@ +from datetime import datetime, timedelta +from pathlib import Path +from zoneinfo import ZoneInfo + +from nmdc_runtime.api.core.util import sha256hash_from_file + +TEST_FILES_DIR = Path(__file__).parent.parent.joinpath("files") + + +def test_sha256hash_from_file_is_timestamp_dependent(): + file_path = str(TEST_FILES_DIR.joinpath("test_changesheet_update_one_ph.tsv")) + ts_1 = datetime.now(tz=ZoneInfo("America/Los_Angeles")) + ts_2 = ts_1 + timedelta(minutes=1) + hashes = [] + for ts in (ts_1, ts_2): + hashes.append( + sha256hash_from_file( + file_path=file_path, timestamp=ts.isoformat(timespec="minutes") + ) + ) + assert hashes[0] != hashes[1]