Skip to content

Commit

Permalink
cleaning logger and no_data access implementation
Browse files (browse the repository at this point in the history)
  • Loading branch information
jjkoehorst committed Aug 18, 2022
1 parent 540a5a8 commit 211348a
Showing 1 changed file with 44 additions and 38 deletions.
82 changes: 44 additions & 38 deletions cwltool/provenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,10 +412,10 @@ def write_bag_file(
return bag_file

def add_tagfile(
self,
path: str,
no_data: bool = False,
timestamp: Optional[datetime.datetime] = None,
self,
path: str,
no_data: bool = False,
timestamp: Optional[datetime.datetime] = None,
) -> None:
"""Add tag files to our research object."""
self.self_check()
Expand All @@ -425,15 +425,12 @@ def add_tagfile(
return
# FIXME: do the right thing for directories
with open(path, "rb") as tag_file:
_logger.error("Path: %s", path)
# FIXME: Should have more efficient open_tagfile() that
# does all checksums in one go while writing through,
# adding checksums after closing.
# Below probably OK for now as metadata files
# are not too large..?

_logger.info("Performing checksum calculations with no_data %s", no_data)
if no_data:
if cwltool.main.NO_DATA:
checksums[SHA1] = checksum_only(tag_file, hasher=hashlib.sha1)
tag_file.seek(0)
checksums[SHA256] = checksum_only(tag_file, hasher=hashlib.sha256)
Expand All @@ -446,8 +443,6 @@ def add_tagfile(
tag_file.seek(0)
checksums[SHA512] = checksum_copy(tag_file, hasher=hashlib.sha512)



rel_path = posix_path(os.path.relpath(path, self.folder))
self.tagfiles.add(rel_path)
self.add_to_manifest(rel_path, checksums)
Expand Down Expand Up @@ -767,7 +762,6 @@ def generate_snapshot(self, prov_dep: CWLObjectType, no_data: bool) -> None:

# FIXME: What if destination path already exists?
if os.path.exists(filepath):
_logger.error("Filepath: %s", filepath)
try:
if os.path.isdir(filepath):
shutil.copytree(filepath, path)
Expand Down Expand Up @@ -807,13 +801,18 @@ def add_data_file(
timestamp: Optional[datetime.datetime] = None,
content_type: Optional[str] = None,
) -> str:
# TODO only when --no-data is not used!
"""Copy inputs to data/ folder."""
self.self_check()
tmp_dir, tmp_prefix = os.path.split(self.temp_prefix)
with tempfile.NamedTemporaryFile(
prefix=tmp_prefix, dir=tmp_dir, delete=False
) as tmp:
checksum = checksum_only(from_fp, tmp)
# TODO this should depend on the arguments
if cwltool.main.NO_DATA:
checksum = checksum_only(from_fp)
else:
checksum = checksum_copy(from_fp, tmp)

# Calculate hash-based file path
folder = os.path.join(self.folder, DATA, checksum[0:2])
Expand All @@ -825,6 +824,7 @@ def add_data_file(
os.rename(tmp.name, path)

# Relative posix path
# TODO only when no-data is False?...
rel_path = posix_path(os.path.relpath(path, self.folder))

# Register in bagit checksum
Expand Down Expand Up @@ -901,7 +901,10 @@ def _add_to_bagit(self, rel_path: str, **checksums: str) -> None:
checksums = dict(checksums)
with open(lpath, "rb") as file_path:
# FIXME: Need sha-256 / sha-512 as well for Research Object BagIt profile?
checksums[SHA1] = checksum_only(file_path, hasher=hashlib.sha1)
if cwltool.main.NO_DATA:
checksums[SHA1] = checksum_only(file_path, hasher=hashlib.sha1)
else:
checksums[SHA1] = checksum_copy(file_path, hasher=hashlib.sha1)

self.add_to_manifest(rel_path, checksums)

Expand Down Expand Up @@ -1037,17 +1040,22 @@ def checksum_copy(
# TODO: Use hashlib.new(Hasher_str) instead?
checksum = hasher()
contents = src_file.read(buffersize)
# if dst_file and hasattr(dst_file, "name") and hasattr(src_file, "name"):
# temp_location = os.path.join(os.path.dirname(dst_file.name), str(uuid.uuid4()))
# try:
# os.rename(dst_file.name, temp_location)
# os.link(src_file.name, dst_file.name)
# dst_file = None
# os.unlink(temp_location)
# except OSError:
# pass
# if os.path.exists(temp_location):
# os.rename(temp_location, dst_file.name) # type: ignore
if dst_file and hasattr(dst_file, "name") and hasattr(src_file, "name"):
temp_location = os.path.join(os.path.dirname(dst_file.name), str(uuid.uuid4()))
try:
os.rename(dst_file.name, temp_location)
os.link(src_file.name, dst_file.name)
dst_file = None
os.unlink(temp_location)
except OSError:
pass
if os.path.exists(temp_location):
os.rename(temp_location, dst_file.name) # type: ignore

return content_processor(contents, src_file, dst_file, checksum, buffersize)


def content_processor(contents, src_file, dst_file, checksum, buffersize):
while contents != b"":
if dst_file is not None:
dst_file.write(contents)
Expand All @@ -1064,24 +1072,22 @@ def checksum_only(
hasher=Hasher, # type: Callable[[], hashlib._Hash]
buffersize: int = 1024 * 1024,
) -> str:
# TODO, one level up with a provenance -no-data option?
# First step, force dst_file to be none so it computes the checksum but does not write it to its destination
_logger.error("Hard force for dst_file to be None")
_logger.error("src_file: %s", src_file)
dst_file = None

if dst_file != None:
_logger.error("Destination file should be None but it is %s", dst_file)

"""Compute checksums while copying a file."""
# TODO: Use hashlib.new(Hasher_str) instead?
checksum = hasher()
contents = src_file.read(buffersize)

# TODO Could be a function for both checksum_only and checksum_copy?
while contents != b"":
if dst_file is not None:
_logger.error("WRITING!!! %s", dst_file)
dst_file.write(contents)
checksum.update(contents)
contents = src_file.read(buffersize)
if dst_file is not None:
dst_file.flush()
return checksum.hexdigest().lower()
return content_processor(contents, src_file, dst_file, checksum, buffersize)
# while contents != b"":
# if dst_file is not None:
# dst_file.write(contents)
# checksum.update(contents)
# contents = src_file.read(buffersize)
# if dst_file is not None:
# dst_file.flush()
# return checksum.hexdigest().lower()

0 comments on commit 211348a

Please sign in to comment.