Commit fb629ff
Fix error with max filename length (#55)
Joel Wright authored May 24, 2023
1 parent c0a56c1 commit fb629ff
Showing 2 changed files with 35 additions and 29 deletions.
src/navigator_data_ingest/base/api_client.py (21 changes: 16 additions & 5 deletions)
```diff
@@ -57,6 +57,7 @@ def get_machine_user_token():
 def upload_document(
     session: requests.Session,
     source_url: str,
+    s3_prefix: str,
     file_name_without_suffix: str,
     document_bucket: str,
     import_id: str,
@@ -95,21 +96,31 @@ def upload_document(
 
     # Calculate the md5sum & update the result object with the calculated value
     file_content = download_response.content
-    file_content_hash = hashlib.md5(file_content).hexdigest()
-    upload_result.md5_sum = file_content_hash
+    file_hash = hashlib.md5(file_content).hexdigest()
+    upload_result.md5_sum = file_hash
 
+    # ext4, used in the Amazon Linux /tmp directory, has a max filename length of
+    # 255 bytes, so trim to ensure we don't exceed that. Choose 240 initially to
+    # allow for the suffix.
+    file_name_max_fs_len_no_suffix = file_name_without_suffix[:240]
+    while len(file_name_max_fs_len_no_suffix.encode("utf-8")) > 240:
+        file_name_max_fs_len_no_suffix = file_name_max_fs_len_no_suffix[:-5]
+
     # S3 can only handle paths of up to 1024 bytes. To ensure we don't exceed that,
     # we trim the filename if it's too long.
     file_suffix = FILE_EXTENSION_MAPPING.get(content_type, "")
     filename_max_len = (
         1024
+        - len(s3_prefix)
         - len(file_suffix)
-        - len(file_content_hash)
+        - len(file_hash)
         - len("_.")  # length of additional characters for joining path components
     )
-    file_name_without_suffix_trimmed = file_name_without_suffix[:filename_max_len]
+    file_name_no_suffix_trimmed = file_name_max_fs_len_no_suffix[:filename_max_len]
+    # Safe not to loop over the encoding of file_name because everything we're
+    # adding is 1 byte == 1 char.
     file_name = (
-        f"{file_name_without_suffix_trimmed}_{file_content_hash}{file_suffix}"
+        f"{s3_prefix}/{file_name_no_suffix_trimmed}_{file_hash}{file_suffix}"
     )
 
     _LOGGER.info(
```
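The fix layers two independent limits: the basename is first trimmed to at most 240 bytes of UTF-8 (ext4 caps each path component at 255 bytes), and the whole key is then budgeted against S3's 1024-byte limit. Below is a minimal, runnable sketch of that logic; `FILE_EXTENSION_MAPPING` is stubbed out here for illustration, and `build_s3_key` is a hypothetical helper name, not a function from the repository:

```python
import hashlib

# Hypothetical stub for illustration; the real mapping lives in api_client.py.
FILE_EXTENSION_MAPPING = {"application/pdf": ".pdf"}


def build_s3_key(s3_prefix: str, name: str, content: bytes, content_type: str) -> str:
    """Sketch of the two-stage trim: ext4 component limit, then S3 key limit."""
    file_hash = hashlib.md5(content).hexdigest()
    file_suffix = FILE_EXTENSION_MAPPING.get(content_type, "")

    # Stage 1: ext4 allows at most 255 bytes per filename, so aim for 240 to
    # leave room for the suffix. Slice by characters first, then shrink until
    # the UTF-8 encoding fits: 240 characters may encode to up to 960 bytes.
    trimmed = name[:240]
    while len(trimmed.encode("utf-8")) > 240:
        trimmed = trimmed[:-5]

    # Stage 2: S3 keys max out at 1024 bytes, so subtract every other
    # component of the final key from the budget before trimming again.
    max_len = 1024 - len(s3_prefix) - len(file_suffix) - len(file_hash) - len("_.")
    return f"{s3_prefix}/{trimmed[:max_len]}_{file_hash}{file_suffix}"


# A 300-character name of 4-byte emoji exercises both the byte loop and the cap.
key = build_s3_key("GBR/2023", "\U0001F30D" * 300, b"pdf bytes", "application/pdf")
assert len(key.encode("utf-8")) <= 1024
```

Dropping five characters per pass is coarse but safe: each pass frees at least five bytes and at most twenty (a UTF-8 character is at most four bytes), so the loop terminates after a handful of iterations while staying close to the 240-byte target.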
src/navigator_data_ingest/base/new_document_actions.py (43 changes: 19 additions & 24 deletions)
```diff
@@ -77,35 +77,30 @@ def _upload_document(
     doc_slug = slugify(document.name)
     doc_geo = document.geography
     doc_year = document.publication_ts.year
-    file_name = f"{doc_geo}/{doc_year}/{doc_slug}"
+    s3_prefix = f"{doc_geo}/{doc_year}"
 
-    if not document.source_url and not document.download_url:
-        _LOGGER.info(
-            f"Skipping upload for '{document.source}:{document.import_id}:"
-            f"{document.name}' because both the source URL and download URL are empty."
-        )
-        return UploadResult(
-            cdn_object=None,
-            md5_sum=None,
-            content_type=None,
-        )
-
-    def get_url_to_use(_document: Document) -> str:
-        """
-        Get the URL to use for when downloading a document.
-
-        We use the download_url (cached document) if one is provided or default to the source_url.
-        """
-        if _document.download_url is not None:
-            return _document.download_url
-        return _document.source_url.split("|")[0].strip()
+    if not document.download_url:
+        if not document.source_url:
+            _LOGGER.info(
+                f"Skipping upload for '{document.source}:{document.import_id}:"
+                f"{document.name}' because both the source URL and download URL are empty."
+            )
+            return UploadResult(
+                cdn_object=None,
+                md5_sum=None,
+                content_type=None,
+            )
+        file_download_source = document.source_url
+    else:
+        file_download_source = document.download_url
 
     return upload_document(
         session,
-        get_url_to_use(document),
-        file_name,
+        file_download_source,
+        s3_prefix,
+        doc_slug,
         document_bucket,
-        document.import_id
+        document.import_id,
     )
 
 
```
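The inner `get_url_to_use` helper is replaced by straight branching: prefer the cached `download_url`, fall back to `source_url`, and skip the upload when both are empty. (Note the old helper also split multi-part source URLs on `|` and took the first entry; the new code passes `source_url` through as-is.) A self-contained sketch of that selection, with a minimal stand-in for the real `Document` model and a hypothetical `choose_download_source` name:

```python
from dataclasses import dataclass
from typing import Optional


# Minimal stand-in for the real Document model, for illustration only.
@dataclass
class Document:
    source_url: Optional[str]
    download_url: Optional[str]


def choose_download_source(document: Document) -> Optional[str]:
    """Prefer the cached download_url; fall back to source_url; None means skip."""
    if not document.download_url:
        if not document.source_url:
            # The caller logs the skip and returns an empty UploadResult here.
            return None
        return document.source_url
    return document.download_url


assert choose_download_source(Document("https://example.org/a.pdf", None)) == "https://example.org/a.pdf"
assert choose_download_source(Document(None, "https://cdn.example.org/a.pdf")) == "https://cdn.example.org/a.pdf"
assert choose_download_source(Document(None, None)) is None
```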
