From e070ce19096dd639be6ff5b3751ccf33c02cf7c5 Mon Sep 17 00:00:00 2001
From: NextGenEng <58440325+THOR300@users.noreply.github.com>
Date: Wed, 12 Apr 2023 09:38:46 +0100
Subject: [PATCH] Tighten the glob patterns used to find document files. (#36)

Ensure we don't pick up undesired files, such as any file whose name merely starts with the document ID, because of an overly loose glob pattern.

---------

Co-authored-by: Mark <mark@climatepolicyradar.org>
---
 .../base/updated_document_actions.py          | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/navigator_data_ingest/base/updated_document_actions.py b/src/navigator_data_ingest/base/updated_document_actions.py
index f39e14c3..8fc86dec 100644
--- a/src/navigator_data_ingest/base/updated_document_actions.py
+++ b/src/navigator_data_ingest/base/updated_document_actions.py
@@ -20,6 +20,15 @@
 _LOGGER = logging.getLogger(__file__)
 
 
+def get_document_files(
+    prefix_path: S3Path, document_id: str, suffix_filter: str
+) -> List[S3Path]:
+    """List `<id>.<suffix>` and `<id>_translated_*.<suffix>` files in prefix_path."""
+    return list(prefix_path.glob(f"{document_id}.{suffix_filter}")) + list(
+        prefix_path.glob(f"{document_id}_translated_*.{suffix_filter}")
+    )
+
+
 def handle_document_updates(
     executor: Executor,
     source: Generator[Tuple[str, List[Update]], None, None],
@@ -144,7 +153,9 @@ def update_dont_parse(
         ),
     ]:
         # Might be translated and non-translated json objects
-        document_files = list(prefix_path.glob(f"{document_id}*.json"))
+        document_files = get_document_files(
+            prefix_path, document_id, suffix_filter="json"
+        )
         for document_file in document_files:
             errors.append(
                 update_file_field(
@@ -218,7 +229,9 @@ def parse(
         ),
     ]:
         # Might be translated and non-translated json objects
-        document_files = list(prefix_path.glob(f"{document_id}*.json"))
+        document_files = get_document_files(
+            prefix_path, document_id, suffix_filter="json"
+        )
         for document_file in document_files:
             errors.append(
                 update_file_field(
@@ -239,7 +252,7 @@ def parse(
         )
 
         # Might be translated and non-translated json objects
-        document_files = list(prefix_path.glob(f"{document_id}*.*"))
+        document_files = get_document_files(prefix_path, document_id, suffix_filter="*")
         for document_file in document_files:
             errors.append(
                 rename(