From fb629ff147231f78b8006eb2da0135dfb7df3900 Mon Sep 17 00:00:00 2001
From: Joel Wright
Date: Wed, 24 May 2023 21:10:08 +0100
Subject: [PATCH] Fix error with max filename length (#55)

---
 src/navigator_data_ingest/base/api_client.py | 21 ++++++---
 .../base/new_document_actions.py             | 43 ++++++++-----------
 2 files changed, 35 insertions(+), 29 deletions(-)

diff --git a/src/navigator_data_ingest/base/api_client.py b/src/navigator_data_ingest/base/api_client.py
index a2a2c85..e1d0582 100644
--- a/src/navigator_data_ingest/base/api_client.py
+++ b/src/navigator_data_ingest/base/api_client.py
@@ -57,6 +57,7 @@ def get_machine_user_token():
 def upload_document(
     session: requests.Session,
     source_url: str,
+    s3_prefix: str,
     file_name_without_suffix: str,
     document_bucket: str,
     import_id: str,
@@ -95,21 +96,31 @@ def upload_document(
 
     # Calculate the m5sum & update the result object with the calculated value
     file_content = download_response.content
-    file_content_hash = hashlib.md5(file_content).hexdigest()
-    upload_result.md5_sum = file_content_hash
+    file_hash = hashlib.md5(file_content).hexdigest()
+    upload_result.md5_sum = file_hash
+
+    # ext4 used in Amazon Linux /tmp directory has a max filename length of
+    # 255 bytes, so trim to ensure we don't exceed that. Choose 240 initially to
+    # allow for suffix.
+    file_name_max_fs_len_no_suffix = file_name_without_suffix[:240]
+    while len(file_name_max_fs_len_no_suffix.encode("utf-8")) > 240:
+        file_name_max_fs_len_no_suffix = file_name_max_fs_len_no_suffix[:-5]
 
     # s3 can only handle paths of up to 1024 bytes. To ensure we don't exceed that,
     # we trim the filename if it's too long
     file_suffix = FILE_EXTENSION_MAPPING.get(content_type, "")
     filename_max_len = (
         1024
+        - len(s3_prefix)
         - len(file_suffix)
-        - len(file_content_hash)
+        - len(file_hash)
         - len("_.")  # length of additional characters for joining path components
     )
-    file_name_without_suffix_trimmed = file_name_without_suffix[:filename_max_len]
+    file_name_no_suffix_trimmed = file_name_max_fs_len_no_suffix[:filename_max_len]
+    # Safe not to loop over the encoding of file_name because everything we're
+    # adding is 1byte == 1char
     file_name = (
-        f"{file_name_without_suffix_trimmed}_{file_content_hash}{file_suffix}"
+        f"{s3_prefix}/{file_name_no_suffix_trimmed}_{file_hash}{file_suffix}"
     )
 
     _LOGGER.info(
diff --git a/src/navigator_data_ingest/base/new_document_actions.py b/src/navigator_data_ingest/base/new_document_actions.py
index 7c5418e..c130023 100644
--- a/src/navigator_data_ingest/base/new_document_actions.py
+++ b/src/navigator_data_ingest/base/new_document_actions.py
@@ -77,35 +77,30 @@ def _upload_document(
     doc_slug = slugify(document.name)
     doc_geo = document.geography
     doc_year = document.publication_ts.year
-    file_name = f"{doc_geo}/{doc_year}/{doc_slug}"
+    s3_prefix = f"{doc_geo}/{doc_year}"
 
-    if not document.source_url and not document.download_url:
-        _LOGGER.info(
-            f"Skipping upload for '{document.source}:{document.import_id}:"
-            f"{document.name}' because both the source URL and download URL are empty."
-        )
-        return UploadResult(
-            cdn_object=None,
-            md5_sum=None,
-            content_type=None,
-        )
-
-    def get_url_to_use(_document: Document) -> str:
-        """
-        Get the URL to use for when downloading a document.
-
-        We use the download_url (cached document) if one is provided or default to the source_url.
-        """
-        if _document.download_url is not None:
-            return _document.download_url
-        return _document.source_url.split("|")[0].strip()
+    if not document.download_url:
+        if not document.source_url:
+            _LOGGER.info(
+                f"Skipping upload for '{document.source}:{document.import_id}:"
+                f"{document.name}' because both the source URL and download URL are empty."
+            )
+            return UploadResult(
+                cdn_object=None,
+                md5_sum=None,
+                content_type=None,
+            )
+        file_download_source = document.source_url
+    else:
+        file_download_source = document.download_url
 
     return upload_document(
         session,
-        get_url_to_use(document),
-        file_name,
+        file_download_source,
+        s3_prefix,
+        doc_slug,
         document_bucket,
-        document.import_id
+        document.import_id,
     )