From 8b783dce9870ff2bb5331552ff730218419f247c Mon Sep 17 00:00:00 2001 From: paulfouquet <86932794+paulfouquet@users.noreply.github.com> Date: Fri, 24 May 2024 09:37:58 +1200 Subject: [PATCH] fix: should use unique file names when copying files to standardise TDE-1186 (#974) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Alice Fage #### Motivation When a source input contains more than one file with the same name, the standardising script should not overwrite them during the copy but should copy them all. #### Modification Generate a filename based on the multihash of the input file's original path. #### Checklist - [ ] Tests updated - [x] Docs updated - [x] Issue linked in Title --- scripts/files/fs.py | 28 ++++++++++++++++++---------- scripts/translate_ascii.py | 2 +- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/scripts/files/fs.py b/scripts/files/fs.py index 359846c75..2b92a1984 100644 --- a/scripts/files/fs.py +++ b/scripts/files/fs.py @@ -9,6 +9,7 @@ from scripts.aws.aws_helper import is_s3 from scripts.files import fs_local, fs_s3 +from scripts.stac.util.checksum import multihash_as_hex if TYPE_CHECKING: from mypy_boto3_s3 import S3Client @@ -93,20 +94,22 @@ def modified(path: str, s3_client: Optional[S3Client] = None) -> datetime: return fs_local.modified(Path(path)) -def write_all(inputs: List[str], target: str, concurrency: Optional[int] = 4) -> List[str]: +def write_all( + inputs: List[str], target: str, concurrency: Optional[int] = 4, generate_name: Optional[bool] = True +) -> List[str]: """Writes list of files to target destination using multithreading. 
- Args: inputs: list of files to read target: target folder to write to concurrency: max thread pool workers + generated_name: create a target file name based on multihash the source filename Returns: list of written file paths """ written_tiffs: List[str] = [] with ThreadPoolExecutor(max_workers=concurrency) as executor: - futuress = {write_file(executor, input_, target): input_ for input_ in inputs} + futuress = {write_file(executor, input_, target, generate_name): input_ for input_ in inputs} for future in as_completed(futuress): if future.exception(): get_log().warn("Failed Read-Write", error=future.exception()) @@ -121,15 +124,13 @@ def write_all(inputs: List[str], target: str, concurrency: Optional[int] = 4) -> def write_sidecars(inputs: List[str], target: str, concurrency: Optional[int] = 4) -> None: - """Writes list of files to target destination using multithreading. + """Writes list of files (if found) to target destination using multithreading. + The copy of the files have a generated file name (@see `write_file`) Args: inputs: list of files to read target: target folder to write to concurrency: max thread pool workers - - Returns: - """ with ThreadPoolExecutor(max_workers=concurrency) as executor: results = {write_file(executor, input_, target): input_ for input_ in inputs} @@ -141,20 +142,27 @@ def write_sidecars(inputs: List[str], target: str, concurrency: Optional[int] = get_log().info("wrote_sidecar_file", path=future.result()) -def write_file(executor: ThreadPoolExecutor, input_: str, target: str) -> Future[str]: +def write_file(executor: ThreadPoolExecutor, input_: str, target: str, generate_name: Optional[bool] = True) -> Future[str]: """Read a file from a path and write it to a target path. - Args: executor: A ThreadPoolExecutor instance. input_: A path to a file to read. target: A path to write the file to. + generate_name: create a target file name based on multihash the source filename Returns: Future[str]: The result of the execution. 
""" get_log().info(f"Trying write from file: {input_}") + + if generate_name: + file_name, file_extension = os.path.splitext(input_) + target_file_name = f"{multihash_as_hex(str.encode(file_name))}{file_extension}" + else: + target_file_name = os.path.basename(input_) + try: - return executor.submit(copy, input_, os.path.join(target, f"{os.path.basename(input_)}")) + return executor.submit(copy, input_, os.path.join(target, target_file_name)) except NoSuchFileError as nsfe: future: Future[str] = Future() future.set_exception(nsfe) diff --git a/scripts/translate_ascii.py b/scripts/translate_ascii.py index 9d22a4111..c1e7bbd31 100644 --- a/scripts/translate_ascii.py +++ b/scripts/translate_ascii.py @@ -51,7 +51,7 @@ def main() -> None: p.close() p.join() - write_all(inputs=tiffs, target=arguments.target) + write_all(inputs=tiffs, target=arguments.target, generate_name=False) get_log().info("ascii file translation complete", duration=time_in_ms() - start_time, count=len(tiffs)) # copy any sidecar files to target