From e070ce19096dd639be6ff5b3751ccf33c02cf7c5 Mon Sep 17 00:00:00 2001 From: NextGenEng <58440325+THOR300@users.noreply.github.com> Date: Wed, 12 Apr 2023 09:38:46 +0100 Subject: [PATCH] Updating the regex to be tighter. (#36) Ensuring we don't pick up undesired files due to a looser regex. --------- Co-authored-by: Mark --- .../base/updated_document_actions.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/navigator_data_ingest/base/updated_document_actions.py b/src/navigator_data_ingest/base/updated_document_actions.py index f39e14c3..8fc86dec 100644 --- a/src/navigator_data_ingest/base/updated_document_actions.py +++ b/src/navigator_data_ingest/base/updated_document_actions.py @@ -20,6 +20,15 @@ _LOGGER = logging.getLogger(__file__) +def get_document_files( + prefix_path: S3Path, document_id: str, suffix_filter: str +) -> List[S3Path]: + """Get the document files for a given document ID found in an s3 directory.""" + return list(prefix_path.glob(f"{document_id}.{suffix_filter}")) + list( + prefix_path.glob(f"{document_id}_translated_*.{suffix_filter}") + ) + + def handle_document_updates( executor: Executor, source: Generator[Tuple[str, List[Update]], None, None], @@ -144,7 +153,9 @@ def update_dont_parse( ), ]: # Might be translated and non-translated json objects - document_files = list(prefix_path.glob(f"{document_id}*.json")) + document_files = get_document_files( + prefix_path, document_id, suffix_filter="json" + ) for document_file in document_files: errors.append( update_file_field( @@ -218,7 +229,9 @@ def parse( ), ]: # Might be translated and non-translated json objects - document_files = list(prefix_path.glob(f"{document_id}*.json")) + document_files = get_document_files( + prefix_path, document_id, suffix_filter="json" + ) for document_file in document_files: errors.append( update_file_field( @@ -239,7 +252,7 @@ def parse( ) # Might be translated and non-translated json objects - document_files = list(prefix_path.glob(f"{document_id}*.*")) + document_files = get_document_files(prefix_path, document_id, suffix_filter="*") for document_file in document_files: errors.append( rename(