From fb629ff147231f78b8006eb2da0135dfb7df3900 Mon Sep 17 00:00:00 2001
From: Joel Wright
Date: Wed, 24 May 2023 21:10:08 +0100
Subject: [PATCH] Fix error with max filename length (#55)

---
 src/navigator_data_ingest/base/api_client.py | 21 ++++++---
 .../base/new_document_actions.py             | 43 ++++++++-----------
 2 files changed, 35 insertions(+), 29 deletions(-)

diff --git a/src/navigator_data_ingest/base/api_client.py b/src/navigator_data_ingest/base/api_client.py
index a2a2c85..e1d0582 100644
--- a/src/navigator_data_ingest/base/api_client.py
+++ b/src/navigator_data_ingest/base/api_client.py
@@ -57,6 +57,7 @@ def get_machine_user_token():
 def upload_document(
     session: requests.Session,
     source_url: str,
+    s3_prefix: str,
     file_name_without_suffix: str,
     document_bucket: str,
     import_id: str,
@@ -95,21 +96,31 @@ def upload_document(
 
     # Calculate the m5sum & update the result object with the calculated value
     file_content = download_response.content
-    file_content_hash = hashlib.md5(file_content).hexdigest()
-    upload_result.md5_sum = file_content_hash
+    file_hash = hashlib.md5(file_content).hexdigest()
+    upload_result.md5_sum = file_hash
+
+    # ext4 used in Amazon Linux /tmp directory has a max filename length of
+    # 255 bytes, so trim to ensure we don't exceed that. Choose 240 initially to
+    # allow for suffix.
+    file_name_max_fs_len_no_suffix = file_name_without_suffix[:240]
+    while len(file_name_max_fs_len_no_suffix.encode("utf-8")) > 240:
+        file_name_max_fs_len_no_suffix = file_name_max_fs_len_no_suffix[:-5]
 
     # s3 can only handle paths of up to 1024 bytes. To ensure we don't exceed that,
     # we trim the filename if it's too long
     file_suffix = FILE_EXTENSION_MAPPING.get(content_type, "")
     filename_max_len = (
         1024
+        - len(s3_prefix)
         - len(file_suffix)
-        - len(file_content_hash)
+        - len(file_hash)
         - len("_.")  # length of additional characters for joining path components
     )
-    file_name_without_suffix_trimmed = file_name_without_suffix[:filename_max_len]
+    file_name_no_suffix_trimmed = file_name_max_fs_len_no_suffix[:filename_max_len]
+    # Safe not to loop over the encoding of file_name because everything we're
+    # adding is 1byte == 1char
     file_name = (
-        f"{file_name_without_suffix_trimmed}_{file_content_hash}{file_suffix}"
+        f"{s3_prefix}/{file_name_no_suffix_trimmed}_{file_hash}{file_suffix}"
     )
 
     _LOGGER.info(
diff --git a/src/navigator_data_ingest/base/new_document_actions.py b/src/navigator_data_ingest/base/new_document_actions.py
index 7c5418e..c130023 100644
--- a/src/navigator_data_ingest/base/new_document_actions.py
+++ b/src/navigator_data_ingest/base/new_document_actions.py
@@ -77,35 +77,30 @@ def _upload_document(
     doc_slug = slugify(document.name)
     doc_geo = document.geography
     doc_year = document.publication_ts.year
-    file_name = f"{doc_geo}/{doc_year}/{doc_slug}"
+    s3_prefix = f"{doc_geo}/{doc_year}"
 
-    if not document.source_url and not document.download_url:
-        _LOGGER.info(
-            f"Skipping upload for '{document.source}:{document.import_id}:"
-            f"{document.name}' because both the source URL and download URL are empty."
-        )
-        return UploadResult(
-            cdn_object=None,
-            md5_sum=None,
-            content_type=None,
-        )
-
-    def get_url_to_use(_document: Document) -> str:
-        """
-        Get the URL to use for when downloading a document.
-
-        We use the download_url (cached document) if one is provided or default to the source_url.
-        """
-        if _document.download_url is not None:
-            return _document.download_url
-        return _document.source_url.split("|")[0].strip()
+    if not document.download_url:
+        if not document.source_url:
+            _LOGGER.info(
+                f"Skipping upload for '{document.source}:{document.import_id}:"
+                f"{document.name}' because both the source URL and download URL are empty."
+            )
+            return UploadResult(
+                cdn_object=None,
+                md5_sum=None,
+                content_type=None,
+            )
+        file_download_source = document.source_url
+    else:
+        file_download_source = document.download_url
 
     return upload_document(
         session,
-        get_url_to_use(document),
-        file_name,
+        file_download_source,
+        s3_prefix,
+        doc_slug,
         document_bucket,
-        document.import_id
+        document.import_id,
     )