fix: should use unique file names when copying files to standardise TDE-1186 (#974)

Co-authored-by: Alice Fage <[email protected]>

#### Motivation

When a source input contains more than one file with the same name, the
standardising script should copy them all rather than overwriting one with
another during the copy.

#### Modification

Generate a file name based on the multihash of the input file's original path.
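
As a rough illustration of the naming scheme (a plain SHA-256 hex digest from
`hashlib` stands in for the repository's `multihash_as_hex` helper, and the
paths are made up):

```python
import hashlib
import os


def unique_target_name(source_path: str) -> str:
    """Derive a collision-free target file name from the source path.

    Illustrative sketch only: the actual script hashes the path (without its
    extension) with scripts.stac.util.checksum.multihash_as_hex; a SHA-256
    hex digest is used here instead.
    """
    stem, extension = os.path.splitext(source_path)
    return f"{hashlib.sha256(stem.encode()).hexdigest()}{extension}"


# Two inputs sharing a base name now map to two distinct target names:
print(unique_target_name("s3://bucket/survey-a/tile.tiff"))
print(unique_target_name("s3://bucket/survey-b/tile.tiff"))
```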

#### Checklist

- [ ] Tests updated
- [x] Docs updated
- [x] Issue linked in Title
paulfouquet authored May 23, 2024
1 parent 719d96e commit 8b783dc
Showing 2 changed files with 19 additions and 11 deletions.
28 changes: 18 additions & 10 deletions scripts/files/fs.py
@@ -9,6 +9,7 @@

from scripts.aws.aws_helper import is_s3
from scripts.files import fs_local, fs_s3
from scripts.stac.util.checksum import multihash_as_hex

if TYPE_CHECKING:
from mypy_boto3_s3 import S3Client
@@ -93,20 +94,22 @@ def modified(path: str, s3_client: Optional[S3Client] = None) -> datetime:
return fs_local.modified(Path(path))


def write_all(inputs: List[str], target: str, concurrency: Optional[int] = 4) -> List[str]:
def write_all(
inputs: List[str], target: str, concurrency: Optional[int] = 4, generate_name: Optional[bool] = True
) -> List[str]:
"""Writes list of files to target destination using multithreading.
Args:
inputs: list of files to read
target: target folder to write to
concurrency: max thread pool workers
generate_name: create a target file name based on the multihash of the source file path
Returns:
list of written file paths
"""
written_tiffs: List[str] = []
with ThreadPoolExecutor(max_workers=concurrency) as executor:
futuress = {write_file(executor, input_, target): input_ for input_ in inputs}
futuress = {write_file(executor, input_, target, generate_name): input_ for input_ in inputs}
for future in as_completed(futuress):
if future.exception():
get_log().warn("Failed Read-Write", error=future.exception())
@@ -121,15 +124,13 @@ def write_all(inputs: List[str], target: str, concurrency: Optional[int] = 4) ->


def write_sidecars(inputs: List[str], target: str, concurrency: Optional[int] = 4) -> None:
"""Writes list of files to target destination using multithreading.
"""Writes list of files (if found) to target destination using multithreading.
Copies of the files are given a generated file name (see `write_file`)
Args:
inputs: list of files to read
target: target folder to write to
concurrency: max thread pool workers
Returns:
"""
with ThreadPoolExecutor(max_workers=concurrency) as executor:
results = {write_file(executor, input_, target): input_ for input_ in inputs}
@@ -141,20 +142,27 @@ def write_sidecars(inputs: List[str], target: str, concurrency: Optional[int] =
get_log().info("wrote_sidecar_file", path=future.result())


def write_file(executor: ThreadPoolExecutor, input_: str, target: str) -> Future[str]:
def write_file(executor: ThreadPoolExecutor, input_: str, target: str, generate_name: Optional[bool] = True) -> Future[str]:
"""Read a file from a path and write it to a target path.
Args:
executor: A ThreadPoolExecutor instance.
input_: A path to a file to read.
target: A path to write the file to.
generate_name: create a target file name based on the multihash of the source file path
Returns:
Future[str]: The result of the execution.
"""
get_log().info(f"Trying write from file: {input_}")

if generate_name:
file_name, file_extension = os.path.splitext(input_)
target_file_name = f"{multihash_as_hex(str.encode(file_name))}{file_extension}"
else:
target_file_name = os.path.basename(input_)

try:
return executor.submit(copy, input_, os.path.join(target, f"{os.path.basename(input_)}"))
return executor.submit(copy, input_, os.path.join(target, target_file_name))
except NoSuchFileError as nsfe:
future: Future[str] = Future()
future.set_exception(nsfe)
2 changes: 1 addition & 1 deletion scripts/translate_ascii.py
@@ -51,7 +51,7 @@ def main() -> None:
p.close()
p.join()

write_all(inputs=tiffs, target=arguments.target)
write_all(inputs=tiffs, target=arguments.target, generate_name=False)
get_log().info("ascii file translation complete", duration=time_in_ms() - start_time, count=len(tiffs))

# copy any sidecar files to target
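
For context, a hypothetical call to the updated `write_all` (the paths below
are illustrative, not from the repository) contrasts the new default with the
`generate_name=False` opt-out used by `translate_ascii.py`:

```python
from scripts.files.fs import write_all

# Illustrative inputs only: two sources that share a base name.
inputs = [
    "s3://source-a/BX20_1000_0101.tiff",
    "s3://source-b/BX20_1000_0101.tiff",
]

# Default behaviour: each copy is named after the multihash of its source path,
# so the second file no longer overwrites the first in the target folder.
standardised = write_all(inputs=inputs, target="/tmp/standardised/")

# Opt out (as translate_ascii.py does) to keep the original base names, which
# is only safe when the inputs are known not to collide.
translated = write_all(inputs=inputs, target="/tmp/ascii/", generate_name=False)
```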
