From 2096b2f335de4d4fc52ec2edce0f9e17282dfebb Mon Sep 17 00:00:00 2001 From: Victor Engmark Date: Thu, 2 May 2024 14:26:03 +1200 Subject: [PATCH 1/3] refactor: Reuse dictionary to JSON bytes code --- scripts/dict_to_json_bytes.py | 6 ++++++ scripts/stac/imagery/collection.py | 8 ++++---- scripts/standardise_validate.py | 4 ++-- 3 files changed, 12 insertions(+), 6 deletions(-) create mode 100644 scripts/dict_to_json_bytes.py diff --git a/scripts/dict_to_json_bytes.py b/scripts/dict_to_json_bytes.py new file mode 100644 index 000000000..8befb7995 --- /dev/null +++ b/scripts/dict_to_json_bytes.py @@ -0,0 +1,6 @@ +import json +from typing import Any, Dict + + +def dict_to_json_bytes(input_dict: Dict[str, Any], ensure_ascii: bool = True) -> bytes: + return json.dumps(input_dict, ensure_ascii=ensure_ascii).encode("utf-8") diff --git a/scripts/stac/imagery/collection.py b/scripts/stac/imagery/collection.py index 53733bfe4..f413bf2b3 100644 --- a/scripts/stac/imagery/collection.py +++ b/scripts/stac/imagery/collection.py @@ -1,4 +1,3 @@ -import json import os from typing import Any, Dict, List, Optional @@ -6,6 +5,7 @@ import ulid from scripts.datetimes import format_rfc_3339_datetime_string, parse_rfc_3339_datetime +from scripts.dict_to_json_bytes import dict_to_json_bytes from scripts.files.files_helper import ContentType from scripts.files.fs import write from scripts.stac.imagery.capture_area import generate_capture_area, gsd_to_float @@ -95,7 +95,7 @@ def add_capture_area(self, polygons: List[shapely.geometry.shape], target: str, # The GSD is measured in meters (e.g., `0.3m`) capture_area_document = generate_capture_area(polygons, gsd_to_float(self.metadata["gsd"])) - capture_area_content: bytes = json.dumps(capture_area_document).encode("utf-8") + capture_area_content: bytes = dict_to_json_bytes(capture_area_document) file_checksum = checksum.multihash_as_hex(capture_area_content) capture_area = { "href": f"./{CAPTURE_AREA_FILE_NAME}", @@ -129,7 +129,7 @@ def add_item(self, item: Dict[Any, Any]) -> None: item: STAC Item to add """ item_self_link = next((feat for feat in item["links"] if feat["rel"] == "self"), None) - file_checksum = checksum.multihash_as_hex(json.dumps(item).encode("utf-8")) + file_checksum = checksum.multihash_as_hex(dict_to_json_bytes(item)) if item_self_link: self.add_link(href=item_self_link["href"], file_checksum=file_checksum) self.update_temporal_extent(item["properties"]["start_datetime"], item["properties"]["end_datetime"]) @@ -235,7 +235,7 @@ def write_to(self, destination: str) -> None: Args: destination: path of the destination """ - write(destination, json.dumps(self.stac, ensure_ascii=False).encode("utf-8"), content_type=ContentType.JSON.value) + write(destination, dict_to_json_bytes(self.stac, ensure_ascii=False), content_type=ContentType.JSON.value) def _title(self) -> str: """Generates the title for imagery and elevation datasets. diff --git a/scripts/standardise_validate.py b/scripts/standardise_validate.py index 95b32142f..dcc828098 100644 --- a/scripts/standardise_validate.py +++ b/scripts/standardise_validate.py @@ -1,5 +1,4 @@ import argparse -import json import os import sys from typing import List @@ -8,6 +7,7 @@ from scripts.cli.cli_helper import InputParameterError, is_argo, load_input_files, valid_date from scripts.datetimes import format_rfc_3339_nz_midnight_datetime_string +from scripts.dict_to_json_bytes import dict_to_json_bytes from scripts.files.files_helper import SUFFIX_JSON, ContentType from scripts.files.fs import exists, write from scripts.gdal.gdal_helper import get_srs, get_vfs_path @@ -102,7 +102,7 @@ def main() -> None: item = create_item( file.get_path_standardised(), start_datetime, end_datetime, arguments.collection_id, file.get_gdalinfo() ) - write(stac_item_path, json.dumps(item.stac).encode("utf-8"), content_type=ContentType.GEOJSON.value) + write(stac_item_path, dict_to_json_bytes(item.stac), content_type=ContentType.GEOJSON.value) get_log().info("stac_saved", path=stac_item_path) From 9c266ba2034d4a7512f004a601c690a346df86ab Mon Sep 17 00:00:00 2001 From: Victor Engmark Date: Thu, 2 May 2024 15:56:24 +1200 Subject: [PATCH 2/3] feat: Always dump JSON to UTF-8 string Otherwise information will get lost when decoding. --- scripts/dict_to_json_bytes.py | 6 ------ scripts/json_codec.py | 15 +++++++++++++++ scripts/stac/imagery/collection.py | 4 ++-- scripts/standardise_validate.py | 2 +- 4 files changed, 18 insertions(+), 9 deletions(-) delete mode 100644 scripts/dict_to_json_bytes.py create mode 100644 scripts/json_codec.py diff --git a/scripts/dict_to_json_bytes.py b/scripts/dict_to_json_bytes.py deleted file mode 100644 index 8befb7995..000000000 --- a/scripts/dict_to_json_bytes.py +++ /dev/null @@ -1,6 +0,0 @@ -import json -from typing import Any, Dict - - -def dict_to_json_bytes(input_dict: Dict[str, Any], ensure_ascii: bool = True) -> bytes: - return json.dumps(input_dict, ensure_ascii=ensure_ascii).encode("utf-8") diff --git a/scripts/json_codec.py b/scripts/json_codec.py new file mode 100644 index 000000000..efa2405cc --- /dev/null +++ b/scripts/json_codec.py @@ -0,0 +1,15 @@ +import json +from typing import Any, Dict + + +def dict_to_json_bytes(input_dict: Dict[str, Any]) -> bytes: + """ + Try to convert a `dict` into UTF-8 encoded `bytes` representing a JSON dictionary + + Examples: + >>> dict_to_json_bytes({}) + b'{}' + >>> dict_to_json_bytes({"ā": "😀"}) # Unicode code points U+0101 and U+1F600 + b'{"\xc4\x81": "\xf0\x9f\x98\x80"}' + """ + return json.dumps(input_dict, ensure_ascii=False).encode("utf-8") diff --git a/scripts/stac/imagery/collection.py b/scripts/stac/imagery/collection.py index f413bf2b3..03dd8340c 100644 --- a/scripts/stac/imagery/collection.py +++ b/scripts/stac/imagery/collection.py @@ -5,9 +5,9 @@ import ulid from scripts.datetimes import format_rfc_3339_datetime_string, parse_rfc_3339_datetime -from scripts.dict_to_json_bytes import dict_to_json_bytes from scripts.files.files_helper import ContentType from scripts.files.fs import write +from scripts.json_codec import dict_to_json_bytes from scripts.stac.imagery.capture_area import generate_capture_area, gsd_to_float from scripts.stac.imagery.metadata_constants import ( DATA_CATEGORIES, @@ -235,7 +235,7 @@ def write_to(self, destination: str) -> None: Args: destination: path of the destination """ - write(destination, dict_to_json_bytes(self.stac, ensure_ascii=False), content_type=ContentType.JSON.value) + write(destination, dict_to_json_bytes(self.stac), content_type=ContentType.JSON.value) def _title(self) -> str: """Generates the title for imagery and elevation datasets. diff --git a/scripts/standardise_validate.py b/scripts/standardise_validate.py index dcc828098..cd0cee9fe 100644 --- a/scripts/standardise_validate.py +++ b/scripts/standardise_validate.py @@ -7,10 +7,10 @@ from scripts.cli.cli_helper import InputParameterError, is_argo, load_input_files, valid_date from scripts.datetimes import format_rfc_3339_nz_midnight_datetime_string -from scripts.dict_to_json_bytes import dict_to_json_bytes from scripts.files.files_helper import SUFFIX_JSON, ContentType from scripts.files.fs import exists, write from scripts.gdal.gdal_helper import get_srs, get_vfs_path +from scripts.json_codec import dict_to_json_bytes from scripts.stac.imagery.create_stac import create_item from scripts.standardising import run_standardising From b15943ce420f3fbaafa8b8bf6f078adf213f1531 Mon Sep 17 00:00:00 2001 From: Victor Engmark Date: Fri, 3 May 2024 11:17:50 +1200 Subject: [PATCH 3/3] test: Verify that inverting JSON conversion works --- scripts/json_codec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/json_codec.py b/scripts/json_codec.py index efa2405cc..2318ded75 100644 --- a/scripts/json_codec.py +++ b/scripts/json_codec.py @@ -11,5 +11,7 @@ def dict_to_json_bytes(input_dict: Dict[str, Any]) -> bytes: b'{}' >>> dict_to_json_bytes({"ā": "😀"}) # Unicode code points U+0101 and U+1F600 b'{"\xc4\x81": "\xf0\x9f\x98\x80"}' + >>> json.loads(dict_to_json_bytes({"ā": "😀"})) + {'ā': '😀'} """ return json.dumps(input_dict, ensure_ascii=False).encode("utf-8")