From 581221774819cf2c85f239e608a2c3b36dfe106e Mon Sep 17 00:00:00 2001 From: Victor Engmark Date: Thu, 31 Oct 2024 15:48:44 +1300 Subject: [PATCH] refactor: Simplify `ImageryItem` constructor TDE-1298 By pulling out STAC asset and processing data types. --- scripts/stac/imagery/create_stac.py | 61 ++++++++-- scripts/stac/imagery/item.py | 55 ++++----- scripts/stac/imagery/tests/collection_test.py | 48 ++++---- scripts/stac/imagery/tests/generators.py | 34 ++++++ scripts/stac/imagery/tests/item_test.py | 110 +++++++----------- scripts/tests/collection_from_items_test.py | 4 +- scripts/tests/datetimes_test.py | 4 + 7 files changed, 179 insertions(+), 137 deletions(-) create mode 100644 scripts/stac/imagery/tests/generators.py diff --git a/scripts/stac/imagery/create_stac.py b/scripts/stac/imagery/create_stac.py index 51992a341..5346f2840 100644 --- a/scripts/stac/imagery/create_stac.py +++ b/scripts/stac/imagery/create_stac.py @@ -1,20 +1,23 @@ import json +import os from typing import Any from linz_logger import get_log from shapely.geometry.base import BaseGeometry -from scripts.datetimes import utc_now +from scripts.datetimes import format_rfc_3339_datetime_string, utc_now +from scripts.files import fs from scripts.files.files_helper import get_file_name_from_path -from scripts.files.fs import read +from scripts.files.fs import modified, read from scripts.files.geotiff import get_extents from scripts.gdal.gdal_helper import gdal_info from scripts.gdal.gdalinfo import GdalInfo from scripts.stac.imagery.collection import ImageryCollection -from scripts.stac.imagery.item import ImageryItem +from scripts.stac.imagery.item import ImageryItem, STACAsset, STACProcessing, STACProcessingSoftware from scripts.stac.imagery.metadata_constants import CollectionMetadata from scripts.stac.imagery.provider import Provider, ProviderRole from scripts.stac.link import Link, Relation +from scripts.stac.util import checksum from scripts.stac.util.media_type import StacMediaType @@ -77,7 +80,7 @@ def create_collection( def create_item( - file: str, + asset_path: str, start_datetime: str, end_datetime: str, collection_id: str, @@ -88,7 +91,7 @@ def create_item( """Create an ImageryItem (STAC) to be linked to a Collection. Args: - file: asset tiff file + asset_path: asset tiff file start_datetime: start date of the survey end_datetime: end date of the survey collection_id: collection id to link to the Item @@ -99,15 +102,13 @@ def create_item( Returns: a STAC Item wrapped in ImageryItem """ - id_ = get_file_name_from_path(file) + item = create_base_item(asset_path, gdal_version) if not gdalinfo_result: - gdalinfo_result = gdal_info(file) + gdalinfo_result = gdal_info(asset_path) geometry, bbox = get_extents(gdalinfo_result) - item = ImageryItem(id_, file, gdal_version, utc_now) - if derived_from is not None: for derived in derived_from: derived_item_content = read(derived) @@ -129,5 +130,45 @@ def create_item( item.update_spatial(geometry, bbox) item.add_collection(collection_id) - get_log().info("ImageryItem created", path=file) + get_log().info("ImageryItem created", path=asset_path) return item + + +def create_base_item(asset_path: str, gdal_version: str) -> ImageryItem: + """ + Args: + asset_path: asset tiff file path + gdal_version: GDAL version string + + Returns: + An ImageryItem with basic information. + """ + id_ = get_file_name_from_path(asset_path) + file_content = fs.read(asset_path) + file_modified_datetime = format_rfc_3339_datetime_string(modified(asset_path)) + + stac_asset = STACAsset( + **{ + "href": os.path.join(".", os.path.basename(asset_path)), + "file:checksum": checksum.multihash_as_hex(file_content), + "created": file_modified_datetime, + "updated": file_modified_datetime, + } + ) + + now_string = format_rfc_3339_datetime_string(utc_now()) + + if (topo_imagery_hash := os.environ.get("GIT_HASH")) is not None: + commit_url = f"https://github.com/linz/topo-imagery/commit/{topo_imagery_hash}" + else: + commit_url = "GIT_HASH not specified" + + stac_processing = STACProcessing( + **{ + "processing:datetime": now_string, + "processing:software": STACProcessingSoftware(**{"gdal": gdal_version, "linz/topo-imagery": commit_url}), + "processing:version": os.environ.get("GIT_VERSION", "GIT_VERSION not specified"), + } + ) + + return ImageryItem(id_, now_string, stac_asset, stac_processing) diff --git a/scripts/stac/imagery/item.py b/scripts/stac/imagery/item.py index 52cffad0e..c9ca76794 100644 --- a/scripts/stac/imagery/item.py +++ b/scripts/stac/imagery/item.py @@ -1,53 +1,40 @@ -import os -from collections.abc import Callable -from datetime import datetime -from os import environ -from typing import Any +from typing import Any, TypedDict -from scripts.datetimes import format_rfc_3339_datetime_string -from scripts.files import fs -from scripts.files.fs import modified from scripts.stac.link import Link, Relation -from scripts.stac.util import checksum from scripts.stac.util.STAC_VERSION import STAC_VERSION from scripts.stac.util.media_type import StacMediaType from scripts.stac.util.stac_extensions import StacExtensions +STACAsset = TypedDict("STACAsset", {"href": str, "file:checksum": str, "created": str, "updated": str}) + +STACProcessingSoftware = TypedDict("STACProcessingSoftware", {"gdal": str, "linz/topo-imagery": str}) +"""STAC Processing extension LINZ specific fields""" + +STACProcessing = TypedDict( + "STACProcessing", + { + "processing:datetime": str, + "processing:software": STACProcessingSoftware, + "processing:version": str, + }, +) +"""Some of the STAC processing extension fields are not declared in this TypedDict +(https://github.com/stac-extensions/processing?tab=readme-ov-file#fields) +""" + class ImageryItem: stac: dict[str, Any] - def __init__(self, id_: str, file: str, gdal_version: str, now: Callable[[], datetime]) -> None: - file_content = fs.read(file) - file_modified_datetime = format_rfc_3339_datetime_string(modified(file)) - now_string = format_rfc_3339_datetime_string(now()) - if (topo_imagery_hash := environ.get("GIT_HASH")) is not None: - commit_url = f"https://github.com/linz/topo-imagery/commit/{topo_imagery_hash}" - else: - commit_url = "GIT_HASH not specified" - + def __init__(self, id_: str, now_string: str, stac_asset: STACAsset, stac_processing: STACProcessing) -> None: self.stac = { "type": "Feature", "stac_version": STAC_VERSION, "id": id_, "links": [Link(path=f"./{id_}.json", rel=Relation.SELF, media_type=StacMediaType.GEOJSON).stac], - "assets": { - "visual": { - "href": os.path.join(".", os.path.basename(file)), - "type": "image/tiff; application=geotiff; profile=cloud-optimized", - "file:checksum": checksum.multihash_as_hex(file_content), - "created": file_modified_datetime, - "updated": file_modified_datetime, - } - }, + "assets": {"visual": {**stac_asset, "type": "image/tiff; application=geotiff; profile=cloud-optimized"}}, "stac_extensions": [StacExtensions.file.value, StacExtensions.processing.value], - "properties": { - "created": now_string, - "updated": now_string, - "processing:datetime": now_string, - "processing:software": {"gdal": gdal_version, "linz/topo-imagery": commit_url}, - "processing:version": environ.get("GIT_VERSION", "GIT_VERSION not specified"), - }, + "properties": {"created": now_string, "updated": now_string, **stac_processing}, } def update_datetime(self, start_datetime: str, end_datetime: str) -> None: diff --git a/scripts/stac/imagery/tests/collection_test.py b/scripts/stac/imagery/tests/collection_test.py index 1df578c31..7e6ab2f51 100644 --- a/scripts/stac/imagery/tests/collection_test.py +++ b/scripts/stac/imagery/tests/collection_test.py @@ -1,8 +1,6 @@ import json import os import tempfile -from collections.abc import Callable -from datetime import datetime, timezone from shutil import rmtree from tempfile import mkdtemp @@ -17,11 +15,12 @@ from scripts.files.fs import read from scripts.files.fs_s3 import write from scripts.stac.imagery.collection import ImageryCollection -from scripts.stac.imagery.item import ImageryItem +from scripts.stac.imagery.item import ImageryItem, STACAsset from scripts.stac.imagery.metadata_constants import CollectionMetadata from scripts.stac.imagery.provider import Provider, ProviderRole +from scripts.stac.imagery.tests.generators import any_stac_processing, fixed_now_function from scripts.stac.util.stac_extensions import StacExtensions -from scripts.tests.datetimes_test import any_epoch_datetime +from scripts.tests.datetimes_test import any_epoch_datetime, any_epoch_datetime_string def test_title_description_id_created_on_init(fake_collection_metadata: CollectionMetadata, subtests: SubTests) -> None: @@ -103,21 +102,25 @@ def test_interval_updated_from_existing(fake_collection_metadata: CollectionMeta assert collection.stac["extent"]["temporal"]["interval"] == [["2021-01-27T00:00:00Z", "2021-02-20T00:00:00Z"]] -def fixed_now_function(now: datetime) -> Callable[[], datetime]: - def func() -> datetime: - return now - - return func - - def test_add_item(fake_collection_metadata: CollectionMetadata, subtests: SubTests) -> None: now = any_epoch_datetime() - now_function = fixed_now_function(now) - collection = ImageryCollection(fake_collection_metadata, now_function) - item_file_path = "./scripts/tests/data/empty.tiff" - modified_datetime = datetime(2001, 2, 3, hour=4, minute=5, second=6, tzinfo=timezone.utc) - os.utime(item_file_path, times=(any_epoch_datetime().timestamp(), modified_datetime.timestamp())) - item = ImageryItem("BR34_5000_0304", item_file_path, "any GDAL version", now_function) + now_string = format_rfc_3339_datetime_string(now) + collection = ImageryCollection(fake_collection_metadata, fixed_now_function(now)) + asset_created_datetime = any_epoch_datetime_string() + asset_updated_datetime = any_epoch_datetime_string() + item = ImageryItem( + "BR34_5000_0304", + now_string, + STACAsset( + **{ + "href": "any href", + "file:checksum": "any checksum", + "created": asset_created_datetime, + "updated": asset_updated_datetime, + } + ), + any_stac_processing(), + ) geometry = { "type": "Polygon", "coordinates": [[1799667.5, 5815977.0], [1800422.5, 5815977.0], [1800422.5, 5814986.0], [1799667.5, 5814986.0]], @@ -150,13 +153,16 @@ def test_add_item(fake_collection_metadata: CollectionMetadata, subtests: SubTes for property_name in ["created", "updated"]: with subtests.test(msg=f"collection {property_name}"): - assert collection.stac[property_name] == format_rfc_3339_datetime_string(now) + assert collection.stac[property_name] == now_string with subtests.test(msg=f"item properties.{property_name}"): - assert item.stac["properties"][property_name] == format_rfc_3339_datetime_string(now) + assert item.stac["properties"][property_name] == now_string + + with subtests.test(msg="item assets.visual.created"): + assert item.stac["assets"]["visual"]["created"] == asset_created_datetime - with subtests.test(msg=f"item assets.visual.{property_name}"): - assert item.stac["assets"]["visual"][property_name] == "2001-02-03T04:05:06Z" + with subtests.test(msg="item assets.visual.updated"): + assert item.stac["assets"]["visual"]["updated"] == asset_updated_datetime def test_write_collection(fake_collection_metadata: CollectionMetadata) -> None: diff --git a/scripts/stac/imagery/tests/generators.py b/scripts/stac/imagery/tests/generators.py new file mode 100644 index 000000000..93cd6da5c --- /dev/null +++ b/scripts/stac/imagery/tests/generators.py @@ -0,0 +1,34 @@ +from datetime import datetime +from typing import Callable + +from scripts.stac.imagery.item import STACAsset, STACProcessing, STACProcessingSoftware + + +def fixed_now_function(now: datetime) -> Callable[[], datetime]: + def func() -> datetime: + return now + + return func + + +def any_stac_asset() -> STACAsset: + return STACAsset( + **{ + "href": "any href", + "file:checksum": "any checksum", + "created": "any created datetime", + "updated": "any updated datetime", + } + ) + + +def any_stac_processing() -> STACProcessing: + return STACProcessing( + **{ + "processing:datetime": "any processing datetime", + "processing:software": STACProcessingSoftware( + **{"gdal": "any GDAL version", "linz/topo-imagery": "any topo imagery version"} + ), + "processing:version": "any processing version", + } + ) diff --git a/scripts/stac/imagery/tests/item_test.py b/scripts/stac/imagery/tests/item_test.py index 0b0e3d4a3..908d9894b 100644 --- a/scripts/stac/imagery/tests/item_test.py +++ b/scripts/stac/imagery/tests/item_test.py @@ -1,97 +1,76 @@ -from datetime import datetime, timezone +from datetime import datetime from decimal import Decimal from os import environ from unittest.mock import patch -from pytest_mock import MockerFixture from pytest_subtests import SubTests from scripts.files.files_helper import get_file_name_from_path from scripts.stac.imagery.collection import ImageryCollection from scripts.stac.imagery.item import ImageryItem from scripts.stac.imagery.metadata_constants import CollectionMetadata -from scripts.stac.util.stac_extensions import StacExtensions -from scripts.tests.datetimes_test import any_epoch_datetime +from scripts.stac.imagery.tests.generators import any_stac_asset, any_stac_processing +from scripts.tests.datetimes_test import any_epoch_datetime, any_epoch_datetime_string -def test_imagery_stac_item(mocker: MockerFixture, subtests: SubTests) -> None: +def test_imagery_stac_item(subtests: SubTests) -> None: # mock functions that interact with files geometry = { "type": "Polygon", "coordinates": [[[1799667.5, 5815977.0], [1800422.5, 5815977.0], [1800422.5, 5814986.0], [1799667.5, 5814986.0]]], } bbox = (1799667.5, 5815977.0, 1800422.5, 5814986.0) - mocker.patch("scripts.files.fs.read", return_value=b"") path = "./scripts/tests/data/empty.tiff" id_ = get_file_name_from_path(path) start_datetime = "2021-01-27T00:00:00Z" end_datetime = "2021-01-27T00:00:00Z" - def fake_now() -> datetime: - return datetime(1979, 1, 1, tzinfo=timezone.utc) - git_hash = "any Git hash" git_version = "any Git version string" - gdal_version_string = "any GDAL version string" + asset = any_stac_asset() + now_string = any_epoch_datetime_string() + stac_processing = any_stac_processing() with patch.dict(environ, {"GIT_HASH": git_hash, "GIT_VERSION": git_version}): - item = ImageryItem(id_, path, gdal_version_string, fake_now) + item = ImageryItem(id_, now_string, asset, stac_processing) item.update_spatial(geometry, bbox) item.update_datetime(start_datetime, end_datetime) - # checks - with subtests.test(): - assert item.stac["id"] == id_ - - with subtests.test(): - assert item.stac["properties"]["start_datetime"] == start_datetime - - with subtests.test(): - assert item.stac["properties"]["end_datetime"] == end_datetime - - with subtests.test(): - assert item.stac["properties"]["datetime"] is None - - with subtests.test(): - assert ( - item.stac["properties"]["created"] - == item.stac["properties"]["updated"] - == item.stac["properties"]["processing:datetime"] - == "1979-01-01T00:00:00Z" - ) - - with subtests.test(): - assert item.stac["properties"]["processing:version"] == git_version + # checks with subtests.test(): - assert item.stac["properties"]["processing:software"] == { - "gdal": gdal_version_string, - "linz/topo-imagery": f"https://github.com/linz/topo-imagery/commit/{git_hash}", + assert item.stac == { + "assets": { + "visual": {**asset, "type": "image/tiff; application=geotiff; profile=cloud-optimized"}, + }, + "bbox": bbox, + "geometry": geometry, + "id": id_, + "links": [ + { + "href": "./empty.json", + "rel": "self", + "type": "application/geo+json", + }, + ], + "properties": { + "created": now_string, + "datetime": None, + "end_datetime": "2021-01-27T00:00:00Z", + "start_datetime": "2021-01-27T00:00:00Z", + "updated": now_string, + **stac_processing, + }, + "stac_extensions": [ + "https://stac-extensions.github.io/file/v2.0.0/schema.json", + "https://stac-extensions.github.io/processing/v1.2.0/schema.json", + ], + "stac_version": "1.0.0", + "type": "Feature", } - with subtests.test(): - assert item.stac["stac_extensions"] == [StacExtensions.file.value, StacExtensions.processing.value] - - with subtests.test(): - assert item.stac["geometry"]["coordinates"] == geometry["coordinates"] - - with subtests.test(): - assert item.stac["geometry"] == geometry - - with subtests.test(): - assert item.stac["bbox"] == bbox - - with subtests.test(): - assert ( - item.stac["assets"]["visual"]["file:checksum"] - == "1220e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" - ) - - with subtests.test(): - assert {"rel": "self", "href": f"./{id_}.json", "type": "application/geo+json"} in item.stac["links"] - # pylint: disable=duplicate-code -def test_imagery_add_collection(mocker: MockerFixture, subtests: SubTests) -> None: +def test_imagery_add_collection(subtests: SubTests) -> None: metadata: CollectionMetadata = { "category": "urban-aerial-photos", "region": "auckland", @@ -108,8 +87,7 @@ def test_imagery_add_collection(mocker: MockerFixture, subtests: SubTests) -> No path = "./scripts/tests/data/empty.tiff" id_ = get_file_name_from_path(path) - mocker.patch("scripts.files.fs.read", return_value=b"") - item = ImageryItem(id_, path, "any GDAL version", any_epoch_datetime) + item = ImageryItem(id_, any_epoch_datetime_string(), any_stac_asset(), any_stac_processing()) item.add_collection(collection.stac["id"]) @@ -121,13 +99,3 @@ def test_imagery_add_collection(mocker: MockerFixture, subtests: SubTests) -> No with subtests.test(): assert {"rel": "parent", "href": "./collection.json", "type": "application/json"} in item.stac["links"] - - -def test_should_set_fallback_version_strings(subtests: SubTests) -> None: - item = ImageryItem("any ID", "./scripts/tests/data/empty.tiff", "any GDAL version", any_epoch_datetime) - - with subtests.test(): - assert item.stac["properties"]["processing:software"]["linz/topo-imagery"] == "GIT_HASH not specified" - - with subtests.test(): - assert item.stac["properties"]["processing:version"] == "GIT_VERSION not specified" diff --git a/scripts/tests/collection_from_items_test.py b/scripts/tests/collection_from_items_test.py index 5d4c2e416..e2395d0e6 100644 --- a/scripts/tests/collection_from_items_test.py +++ b/scripts/tests/collection_from_items_test.py @@ -18,13 +18,15 @@ from scripts.stac.imagery.collection import ImageryCollection from scripts.stac.imagery.item import ImageryItem from scripts.stac.imagery.metadata_constants import CollectionMetadata +from scripts.stac.imagery.tests.generators import any_stac_asset, any_stac_processing +from scripts.tests.datetimes_test import any_epoch_datetime_string @pytest.fixture(name="item", autouse=True) def setup() -> Iterator[ImageryItem]: # Create mocked STAC Item with patch.dict(environ, {"GIT_HASH": "any Git hash", "GIT_VERSION": "any Git version"}): - item = ImageryItem("123", "./scripts/tests/data/empty.tiff", "any GDAL version", utc_now) + item = ImageryItem("123", any_epoch_datetime_string(), any_stac_asset(), any_stac_processing()) geometry = { "type": "Polygon", "coordinates": [[1799667.5, 5815977.0], [1800422.5, 5815977.0], [1800422.5, 5814986.0], [1799667.5, 5814986.0]], diff --git a/scripts/tests/datetimes_test.py b/scripts/tests/datetimes_test.py index ffb8d3b95..c68909251 100644 --- a/scripts/tests/datetimes_test.py +++ b/scripts/tests/datetimes_test.py @@ -64,3 +64,7 @@ def any_datetime_between(start: datetime, end: datetime) -> datetime: """ range_in_seconds = (end - start).total_seconds() return start + timedelta(seconds=randint(0, int(range_in_seconds))) + + +def any_epoch_datetime_string() -> str: + return format_rfc_3339_datetime_string(any_epoch_datetime())