Skip to content

Commit

Permalink
fix: The hash_cache fails with filenames including surrogates (#492)
Browse files Browse the repository at this point in the history
Cases encountered in practice show that filenames can include unicode
surrogates. This changes the hash_cache functionality to preserve them
by using the "surrogatepass" handler for them.

Another issue encountered is that S3 metadata only allows ASCII, but the
code was placing unicode paths in it.

* Modify the hash_cache sqlite schema to use a BLOB instead of TEXT for
  the file_path, and then explicitly encode/decode it using the 'utf-8'
  encoding and 'surrogatepass' error handler.
* In the canonical json code, use the 'surrogatepass' error handler in
  the canonical file path sorting. Add the new test case data to the
  test manifest file.
* In the error logging of the exception that revealed this bug, include
  the exception context to improve debugging of future similar issues.
* Add a test case for the hash_cache based on an example that was
  failing in practice.
* Add the "asset-root-json" metadata containing the path encoded to
  ASCII via a JSON string. If the file path is not ASCII, it adds this
  metadata instead.

Signed-off-by: Mark Wiebe <[email protected]>
  • Loading branch information
mwiebe authored Nov 5, 2024
1 parent d9e70d2 commit 39c285d
Show file tree
Hide file tree
Showing 9 changed files with 56 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,7 @@ def handle_thread_exception(self, e: BaseException) -> None:
self.summary_edit.setText(f"Error occurred: {str(e)}")
self.summary_edit.setVisible(True)
self.adjustSize()
logger.error(str(e))
logger.exception(e, exc_info=(type(e), e, e.__traceback__))

def _confirm_asset_references_outside_storage_profile(
self, upload_group: AssetUploadGroup
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ def canonical_path_comparator(path: BaseManifestPath):
"""
# Sort by UTF-16 values as per the spec
# https://www.rfc-editor.org/rfc/rfc8785.html#name-sorting-of-object-propertie
return path.path.encode("utf-16_be")
# Use the "surrogatepass" error handler because filenames encountered in the wild
# include surrogates.
return path.path.encode("utf-16_be", errors="surrogatepass")


def manifest_to_canonical_json_string(manifest: BaseAssetManifest) -> str:
Expand Down
12 changes: 11 additions & 1 deletion src/deadline/job_attachments/asset_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import shutil
import sys
import time
import json
from io import BytesIO
from logging import Logger, LoggerAdapter, getLogger
from math import trunc
Expand Down Expand Up @@ -527,7 +528,16 @@ def _upload_output_manifest_to_s3(
full_output_prefix,
f"{manifest_name_prefix}_output",
)
metadata = {"Metadata": {"asset-root": root_path}}
metadata = {"Metadata": {"asset-root": json.dumps(root_path, ensure_ascii=True)}}
# S3 metadata must be ASCII, so use either 'asset-root' or 'asset-root-json' depending
# on whether the value is ASCII.
try:
# Add the 'asset-root' metadata if the path is ASCII
root_path.encode(encoding="ascii")
metadata["Metadata"]["asset-root"] = root_path
except UnicodeEncodeError:
# Add the 'asset-root-json' metadata encoded to ASCII as a JSON string
metadata["Metadata"]["asset-root-json"] = json.dumps(root_path, ensure_ascii=True)
if file_system_location_name:
metadata["Metadata"]["file-system-location-name"] = file_system_location_name

Expand Down
19 changes: 14 additions & 5 deletions src/deadline/job_attachments/caches/hash_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
class HashCacheEntry:
"""Represents an entry in the local hash-cache database"""

# The file_path is stored as a BLOB in sqlite, encoded with utf-8 and the "surrogatepass"
# error handler, as file names encountered in practice require this.
file_path: str
hash_algorithm: HashAlgorithm
file_hash: str
Expand All @@ -45,12 +47,12 @@ class HashCache(CacheDB):
"""

CACHE_NAME = "hash_cache"
CACHE_DB_VERSION = 2
CACHE_DB_VERSION = 3

def __init__(self, cache_dir: Optional[str] = None) -> None:
table_name: str = f"hashesV{self.CACHE_DB_VERSION}"
create_query: str = (
f"CREATE TABLE hashesV{self.CACHE_DB_VERSION}(file_path text primary key, hash_algorithm text secondary key, file_hash text, last_modified_time timestamp)"
f"CREATE TABLE hashesV{self.CACHE_DB_VERSION}(file_path blob primary key, hash_algorithm text secondary key, file_hash text, last_modified_time timestamp)"
)
super().__init__(
cache_name=self.CACHE_NAME,
Expand All @@ -71,11 +73,14 @@ def get_entry(
with self.db_lock, self.db_connection:
entry_vals = self.db_connection.execute(
f"SELECT * FROM {self.table_name} WHERE file_path=? AND hash_algorithm=?",
[file_path_key, hash_algorithm.value],
[
file_path_key.encode(encoding="utf-8", errors="surrogatepass"),
hash_algorithm.value,
],
).fetchone()
if entry_vals:
return HashCacheEntry(
file_path=entry_vals[0],
file_path=str(entry_vals[0], encoding="utf-8", errors="surrogatepass"),
hash_algorithm=HashAlgorithm(entry_vals[1]),
file_hash=entry_vals[2],
last_modified_time=str(entry_vals[3]),
Expand All @@ -87,7 +92,11 @@ def put_entry(self, entry: HashCacheEntry) -> None:
"""Inserts or replaces an entry into the hash cache database after acquiring the lock."""
if self.enabled:
with self.db_lock, self.db_connection:
entry_dict = entry.to_dict()
entry_dict["file_path"] = entry_dict["file_path"].encode(
encoding="utf-8", errors="surrogatepass"
)
self.db_connection.execute(
f"INSERT OR REPLACE INTO {self.table_name} VALUES(:file_path, :hash_algorithm, :file_hash, :last_modified_time)",
entry.to_dict(),
entry_dict,
)
8 changes: 6 additions & 2 deletions src/deadline/job_attachments/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import concurrent.futures
import io
import json
import os
import re
import sys
Expand Down Expand Up @@ -640,7 +641,7 @@ def _get_asset_root_from_s3(
) -> Optional[str]:
"""
Gets asset root from metadata (of output manifest) stored in S3.
If the key "asset-root" does not exist in the metadata, returns None.
If neither of the keys "asset-root-json" or "asset-root" exist in the metadata, returns None.
"""
s3_client = get_s3_client(session=session)
try:
Expand Down Expand Up @@ -670,7 +671,10 @@ def _get_asset_root_from_s3(
except Exception as e:
raise AssetSyncError(e) from e

return head["Metadata"].get("asset-root", None)
if "asset-root-json" in head["Metadata"]:
return json.loads(head["Metadata"]["asset-root-json"])
else:
return head["Metadata"].get("asset-root", None)


def get_job_output_paths_by_asset_root(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,15 @@ def test_decode_manifest_v2023_03_03(default_manifest_str_v2023_03_03: str):
Path_v2023_03_03(path="test_dir/test_file", hash="b", size=1, mtime=1479079344833848),
Path_v2023_03_03(path="test_file", hash="a", size=1, mtime=167907934333848),
Path_v2023_03_03(path="\u0080", hash="Control", size=1, mtime=1679079344833348),
Path_v2023_03_03(
path="\u00c3\u00b1", hash="UserTestCase", size=1, mtime=1679579344833848
),
Path_v2023_03_03(
path="ö", hash="LatinSmallLetterOWithDiaeresis", size=1, mtime=1679079344833848
),
Path_v2023_03_03(path="€", hash="EuroSign", size=1, mtime=1679079344836848),
Path_v2023_03_03(path="😀", hash="EmojiGrinningFace", size=1, mtime=1679579344833848),
Path_v2023_03_03(path="\ude0a", hash="EmojiTestCase", size=1, mtime=1679579344833848),
Path_v2023_03_03(
path="דּ", hash="HebrewLetterDaletWithDagesh", size=1, mtime=1679039344833848
),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,13 @@ def test_decode(default_manifest_str_v2023_03_03: str):
ManifestPath(path="test_dir/test_file", hash="b", size=1, mtime=1479079344833848),
ManifestPath(path="test_file", hash="a", size=1, mtime=167907934333848),
ManifestPath(path="\u0080", hash="Control", size=1, mtime=1679079344833348),
ManifestPath(path="\u00c3\u00b1", hash="UserTestCase", size=1, mtime=1679579344833848),
ManifestPath(
path="ö", hash="LatinSmallLetterOWithDiaeresis", size=1, mtime=1679079344833848
),
ManifestPath(path="€", hash="EuroSign", size=1, mtime=1679079344836848),
ManifestPath(path="😀", hash="EmojiGrinningFace", size=1, mtime=1679579344833848),
ManifestPath(path="\ude0a", hash="EmojiTestCase", size=1, mtime=1679579344833848),
ManifestPath(
path="דּ", hash="HebrewLetterDaletWithDagesh", size=1, mtime=1679039344833848
),
Expand Down
17 changes: 14 additions & 3 deletions test/unit/deadline_job_attachments/caches/test_caches.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,14 +86,25 @@ def test_init_empty_path(self, tmpdir):
hc = HashCache()
assert hc.cache_dir == tmpdir.join(f"{HashCache.CACHE_NAME}.db")

def test_get_entry_returns_valid_entry(self, tmpdir):
@pytest.mark.parametrize(
"file_path",
[
# Simple ascii filename
pytest.param("file", id="ascii_name"),
# Name from test case that was failing on Windows for a user
pytest.param("ñ/\u00c3\u00b1.txt", id="regression_test_filename"),
# Name from a generated emoji filename on Windows
pytest.param("\ude0a.txt", id="surrogate_emoji_example"),
],
)
def test_get_entry_returns_valid_entry(self, tmpdir, file_path):
"""
Tests that a valid entry is returned when it exists in the cache already
"""
# GIVEN
cache_dir = tmpdir.mkdir("cache")
expected_entry = HashCacheEntry(
file_path="file",
file_path=file_path,
hash_algorithm=HashAlgorithm.XXH128,
file_hash="hash",
last_modified_time="1234.5678",
Expand All @@ -102,7 +113,7 @@ def test_get_entry_returns_valid_entry(self, tmpdir):
# WHEN
with HashCache(cache_dir) as hc:
hc.put_entry(expected_entry)
actual_entry = hc.get_entry("file", HashAlgorithm.XXH128)
actual_entry = hc.get_entry(file_path, HashAlgorithm.XXH128)

# THEN
assert actual_entry == expected_entry
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"hashAlg":"xxh128","manifestVersion":"2023-03-03","paths":[{"hash":"CarriageReturn","mtime":1679079744833848,"path":"\r","size":1},{"hash":"One","mtime":1679079344833868,"path":"1","size":1},{"hash":"c","mtime":1675079344833848,"path":"another_test_file","size":1},{"hash":"b","mtime":1479079344833848,"path":"test_dir/test_file","size":1},{"hash":"a","mtime":167907934333848,"path":"test_file","size":1},{"hash":"Control","mtime":1679079344833348,"path":"\u0080","size":1},{"hash":"LatinSmallLetterOWithDiaeresis","mtime":1679079344833848,"path":"\u00f6","size":1},{"hash":"EuroSign","mtime":1679079344836848,"path":"\u20ac","size":1},{"hash":"EmojiGrinningFace","mtime":1679579344833848,"path":"\ud83d\ude00","size":1},{"hash":"HebrewLetterDaletWithDagesh","mtime":1679039344833848,"path":"\ufb33","size":1}],"totalSize":10}
{"hashAlg":"xxh128","manifestVersion":"2023-03-03","paths":[{"hash":"CarriageReturn","mtime":1679079744833848,"path":"\r","size":1},{"hash":"One","mtime":1679079344833868,"path":"1","size":1},{"hash":"c","mtime":1675079344833848,"path":"another_test_file","size":1},{"hash":"b","mtime":1479079344833848,"path":"test_dir/test_file","size":1},{"hash":"a","mtime":167907934333848,"path":"test_file","size":1},{"hash":"Control","mtime":1679079344833348,"path":"\u0080","size":1},{"hash":"UserTestCase","mtime":1679579344833848,"path":"\u00c3\u00b1","size":1},{"hash":"LatinSmallLetterOWithDiaeresis","mtime":1679079344833848,"path":"\u00f6","size":1},{"hash":"EuroSign","mtime":1679079344836848,"path":"\u20ac","size":1},{"hash":"EmojiGrinningFace","mtime":1679579344833848,"path":"\ud83d\ude00","size":1},{"hash":"EmojiTestCase","mtime":1679579344833848,"path":"\ude0a","size":1},{"hash":"HebrewLetterDaletWithDagesh","mtime":1679039344833848,"path":"\ufb33","size":1}],"totalSize":10}

0 comments on commit 39c285d

Please sign in to comment.