From 0b6158b5af17ae1e2177f10f92258f26c0211d92 Mon Sep 17 00:00:00 2001 From: Victor Engmark Date: Fri, 3 May 2024 03:21:21 +0000 Subject: [PATCH] refactor: Reuse dictionary to JSON bytes code TDE-1138 (#958) * refactor: Reuse dictionary to JSON bytes code * feat: Always dump JSON to UTF-8 string Otherwise information will get lost when decoding. * test: Verify that inverting JSON conversion works --- scripts/json_codec.py | 17 +++++++++++++++++ scripts/stac/imagery/collection.py | 8 ++++---- scripts/standardise_validate.py | 4 ++-- 3 files changed, 23 insertions(+), 6 deletions(-) create mode 100644 scripts/json_codec.py diff --git a/scripts/json_codec.py b/scripts/json_codec.py new file mode 100644 index 000000000..2318ded75 --- /dev/null +++ b/scripts/json_codec.py @@ -0,0 +1,17 @@ +import json +from typing import Any, Dict + + +def dict_to_json_bytes(input_dict: Dict[str, Any]) -> bytes: + """ + Try to convert a `dict` into UTF-8 encoded `bytes` representing a JSON dictionary + + Examples: + >>> dict_to_json_bytes({}) + b'{}' + >>> dict_to_json_bytes({"ā": "😀"}) # Unicode code points U+0101 and U+1F600 + b'{"\xc4\x81": "\xf0\x9f\x98\x80"}' + >>> json.loads(dict_to_json_bytes({"ā": "😀"})) + {'ā': '😀'} + """ + return json.dumps(input_dict, ensure_ascii=False).encode("utf-8") diff --git a/scripts/stac/imagery/collection.py b/scripts/stac/imagery/collection.py index 53733bfe4..03dd8340c 100644 --- a/scripts/stac/imagery/collection.py +++ b/scripts/stac/imagery/collection.py @@ -1,4 +1,3 @@ -import json import os from typing import Any, Dict, List, Optional @@ -8,6 +7,7 @@ from scripts.datetimes import format_rfc_3339_datetime_string, parse_rfc_3339_datetime from scripts.files.files_helper import ContentType from scripts.files.fs import write +from scripts.json_codec import dict_to_json_bytes from scripts.stac.imagery.capture_area import generate_capture_area, gsd_to_float from scripts.stac.imagery.metadata_constants import ( DATA_CATEGORIES, @@ -95,7 +95,7 @@ def add_capture_area(self, polygons: List[shapely.geometry.shape], target: str, # The GSD is measured in meters (e.g., `0.3m`) capture_area_document = generate_capture_area(polygons, gsd_to_float(self.metadata["gsd"])) - capture_area_content: bytes = json.dumps(capture_area_document).encode("utf-8") + capture_area_content: bytes = dict_to_json_bytes(capture_area_document) file_checksum = checksum.multihash_as_hex(capture_area_content) capture_area = { "href": f"./{CAPTURE_AREA_FILE_NAME}", @@ -129,7 +129,7 @@ def add_item(self, item: Dict[Any, Any]) -> None: item: STAC Item to add """ item_self_link = next((feat for feat in item["links"] if feat["rel"] == "self"), None) - file_checksum = checksum.multihash_as_hex(json.dumps(item).encode("utf-8")) + file_checksum = checksum.multihash_as_hex(dict_to_json_bytes(item)) if item_self_link: self.add_link(href=item_self_link["href"], file_checksum=file_checksum) self.update_temporal_extent(item["properties"]["start_datetime"], item["properties"]["end_datetime"]) @@ -235,7 +235,7 @@ def write_to(self, destination: str) -> None: Args: destination: path of the destination """ - write(destination, json.dumps(self.stac, ensure_ascii=False).encode("utf-8"), content_type=ContentType.JSON.value) + write(destination, dict_to_json_bytes(self.stac), content_type=ContentType.JSON.value) def _title(self) -> str: """Generates the title for imagery and elevation datasets. diff --git a/scripts/standardise_validate.py b/scripts/standardise_validate.py index 95b32142f..cd0cee9fe 100644 --- a/scripts/standardise_validate.py +++ b/scripts/standardise_validate.py @@ -1,5 +1,4 @@ import argparse -import json import os import sys from typing import List @@ -11,6 +10,7 @@ from scripts.files.files_helper import SUFFIX_JSON, ContentType from scripts.files.fs import exists, write from scripts.gdal.gdal_helper import get_srs, get_vfs_path +from scripts.json_codec import dict_to_json_bytes from scripts.stac.imagery.create_stac import create_item from scripts.standardising import run_standardising @@ -102,7 +102,7 @@ def main() -> None: item = create_item( file.get_path_standardised(), start_datetime, end_datetime, arguments.collection_id, file.get_gdalinfo() ) - write(stac_item_path, json.dumps(item.stac).encode("utf-8"), content_type=ContentType.GEOJSON.value) + write(stac_item_path, dict_to_json_bytes(item.stac), content_type=ContentType.GEOJSON.value) get_log().info("stac_saved", path=stac_item_path)