diff --git a/scripts/cli/cli_helper.py b/scripts/cli/cli_helper.py index 5eea4678a..66b808ddf 100644 --- a/scripts/cli/cli_helper.py +++ b/scripts/cli/cli_helper.py @@ -16,7 +16,13 @@ class InputParameterError(Exception): class TileFiles(NamedTuple): output: str + """ The tile name of the output file that will be created """ + inputs: list[str] + """ The list of input files to be used to create the output file """ + + includeDerived: bool = False + """ Whether the STAC Item should include the derived_from links """ def get_tile_files(source: str) -> list[TileFiles]: @@ -34,11 +40,14 @@ def get_tile_files(source: str) -> list[TileFiles]: Example: >>> get_tile_files('[{"output": "CE16_5000_1001", "input": ["s3://bucket/SN9457_CE16_10k_0501.tif"]}]') - [TileFiles(output='CE16_5000_1001', inputs=['s3://bucket/SN9457_CE16_10k_0501.tif'])] + [TileFiles(output='CE16_5000_1001', inputs=['s3://bucket/SN9457_CE16_10k_0501.tif'], includeDerived=False)] """ try: source_json: list[TileFiles] = json.loads( - source, object_hook=lambda d: TileFiles(inputs=d["input"], output=d["output"]) + source, + object_hook=lambda d: TileFiles( + inputs=d["input"], output=d["output"], includeDerived=d.get("includeDerived", False) + ), ) except (json.decoder.JSONDecodeError, KeyError) as e: get_log().error(type(e).__name__, error=str(e)) diff --git a/scripts/cli/tests/cli_helper_test.py b/scripts/cli/tests/cli_helper_test.py index eca338983..c2edd815b 100644 --- a/scripts/cli/tests/cli_helper_test.py +++ b/scripts/cli/tests/cli_helper_test.py @@ -17,10 +17,29 @@ def test_get_tile_files(subtests: SubTests) -> None: with subtests.test(): assert expected_input_filenames == source[0].inputs + with subtests.test(msg="Should not include derived by default"): + assert source[0].includeDerived is False + with subtests.test(): assert expected_output_filename_b == source[1].output +def test_get_tile_files_with_include_derived(subtests: SubTests) -> None: + file_source = '[{"output": "tile_name","input": ["file_a.tiff", "file_b.tiff"], "includeDerived": true}]' + expected_output_filename = "tile_name" + expected_input_filenames = ["file_a.tiff", "file_b.tiff"] + + source: list[TileFiles] = get_tile_files(file_source) + with subtests.test(): + assert expected_output_filename == source[0].output + + with subtests.test(): + assert expected_input_filenames == source[0].inputs + + with subtests.test(): + assert source[0].includeDerived is True + + def test_parse_list() -> None: str_list = "Auckland Council; Toitū Te Whenua Land Information New Zealand;Nelson Council;" list_parsed = parse_list(str_list) diff --git a/scripts/files/file_tiff.py b/scripts/files/file_tiff.py index 45776db5a..8b9c016f1 100644 --- a/scripts/files/file_tiff.py +++ b/scripts/files/file_tiff.py @@ -1,4 +1,5 @@ import json +import os from decimal import Decimal from enum import Enum from typing import Annotated, Any @@ -30,6 +31,7 @@ def __init__( self, paths: list[str], preset: str | None = None, + include_derived: bool = False, ) -> None: paths_original = [] for p in paths: @@ -39,6 +41,12 @@ def __init__( paths_original.append(unquote(p)) self._paths_original = paths_original + self._derived_from_paths = None + if include_derived: + # Transform the TIFF paths to JSON path to point to STAC Items, + # assuming the STAC Items are in the same directory as the TIFF files + self._derived_from_paths = [f"{os.path.splitext(path)[0]}.json" for path in paths_original] + self._path_standardised = "" self._errors: list[dict[str, Any]] = [] self._gdalinfo: GdalInfo | None = None @@ -150,7 +158,7 @@ def get_errors(self) -> list[dict[str, Any]]: return self._errors def get_paths_original(self) -> list[str]: - """Get the path(es) of the original (non standardised) file. + """Get the path(s) of the original (non standardised) file. It can be a list of path if the standardised file is a retiled image. Returns: @@ -158,6 +166,14 @@ def get_paths_original(self) -> list[str]: """ return self._paths_original + def get_derived_from_paths(self) -> list[str] | None: + """Get the path(s) of the STAC Items associated to the TIFF files from which the final output is derived. + + Returns: + a list of STAC Item JSON file paths or None if not derived from other files. + """ + return self._derived_from_paths + def get_path_standardised(self) -> str: """Get the path of the standardised file. diff --git a/scripts/files/files_helper.py b/scripts/files/files_helper.py index 2606414a4..8fd2565ad 100644 --- a/scripts/files/files_helper.py +++ b/scripts/files/files_helper.py @@ -8,8 +8,8 @@ class ContentType(str, Enum): GEOTIFF = "image/tiff; application=geotiff; profile=cloud-optimized" JSON = "application/json" - # https://www.iana.org/assignments/media-types/application/geo+json GEOJSON = "application/geo+json" + """ https://www.iana.org/assignments/media-types/application/geo+json""" JPEG = "image/jpeg" diff --git a/scripts/stac/imagery/collection.py b/scripts/stac/imagery/collection.py index 6aa986894..82b69bdf2 100644 --- a/scripts/stac/imagery/collection.py +++ b/scripts/stac/imagery/collection.py @@ -25,8 +25,10 @@ SubtypeParameterError, ) from scripts.stac.imagery.provider import Provider, ProviderRole +from scripts.stac.link import Link, Relation from scripts.stac.util import checksum from scripts.stac.util.STAC_VERSION import STAC_VERSION +from scripts.stac.util.media_type import StacMediaType from scripts.stac.util.stac_extensions import StacExtensions CAPTURE_AREA_FILE_NAME = "capture-area.geojson" @@ -135,22 +137,18 @@ def add_item(self, item: dict[Any, Any]) -> None: item: STAC Item to add """ item_self_link = next((feat for feat in item["links"] if feat["rel"] == "self"), None) - file_checksum = checksum.multihash_as_hex(dict_to_json_bytes(item)) if item_self_link: - self.add_link(href=item_self_link["href"], file_checksum=file_checksum) + self.stac["links"].append( + Link( + path=item_self_link["href"], + rel=Relation.ITEM, + media_type=StacMediaType.JSON, + file_content=dict_to_json_bytes(item), + ).stac + ) self.update_temporal_extent(item["properties"]["start_datetime"], item["properties"]["end_datetime"]) self.update_spatial_extent(item["bbox"]) - def add_link(self, href: str, file_checksum: str) -> None: - """Add a `link` to the existing `links` list of the Collection. - - Args: - href: path - file_checksum: Optional checksum of file. - """ - link = {"rel": "item", "href": href, "type": "application/json", "file:checksum": file_checksum} - self.stac["links"].append(link) - def add_providers(self, providers: list[Provider]) -> None: """Add a list of Providers to the existing list of `providers` of the Collection. diff --git a/scripts/stac/imagery/create_stac.py b/scripts/stac/imagery/create_stac.py index f827e192a..497182529 100644 --- a/scripts/stac/imagery/create_stac.py +++ b/scripts/stac/imagery/create_stac.py @@ -2,10 +2,13 @@ from scripts.datetimes import utc_now from scripts.files.files_helper import get_file_name_from_path +from scripts.files.fs import read from scripts.files.geotiff import get_extents from scripts.gdal.gdal_helper import gdal_info from scripts.gdal.gdalinfo import GdalInfo from scripts.stac.imagery.item import ImageryItem +from scripts.stac.link import Link, Relation +from scripts.stac.util.media_type import StacMediaType def create_item( @@ -14,6 +17,7 @@ def create_item( end_datetime: str, collection_id: str, gdalinfo_result: GdalInfo | None = None, + derived_from: list[str] | None = None, ) -> ImageryItem: """Create an ImageryItem (STAC) to be linked to a Collection. @@ -23,6 +27,7 @@ def create_item( end_datetime: end date of the survey collection_id: collection id to link to the Item gdalinfo_result: result of the gdalinfo command. Defaults to None. + derived_from: list of STAC Items from where this Item is derived. Defaults to None. Returns: a STAC Item wrapped in ImageryItem @@ -39,5 +44,11 @@ def create_item( item.update_spatial(geometry, bbox) item.add_collection(collection_id) + if derived_from is not None: + for derived in derived_from: + item.add_link( + Link(path=derived, rel=Relation.DERIVED_FROM, media_type=StacMediaType.JSON, file_content=read(derived)) + ) + get_log().info("ImageryItem created", path=file) return item diff --git a/scripts/stac/imagery/item.py b/scripts/stac/imagery/item.py index a28144ff3..0e3e78af4 100644 --- a/scripts/stac/imagery/item.py +++ b/scripts/stac/imagery/item.py @@ -6,8 +6,10 @@ from scripts.datetimes import format_rfc_3339_datetime_string from scripts.files import fs from scripts.files.fs import modified +from scripts.stac.link import Link, Relation from scripts.stac.util import checksum from scripts.stac.util.STAC_VERSION import STAC_VERSION +from scripts.stac.util.media_type import StacMediaType from scripts.stac.util.stac_extensions import StacExtensions @@ -22,9 +24,7 @@ def __init__(self, id_: str, file: str, now: Callable[[], datetime]) -> None: "type": "Feature", "stac_version": STAC_VERSION, "id": id_, - "links": [ - {"rel": "self", "href": f"./{id_}.json", "type": "application/json"}, - ], + "links": [Link(path=f"./{id_}.json", rel=Relation.SELF, media_type=StacMediaType.JSON).stac], "assets": { "visual": { "href": os.path.join(".", os.path.basename(file)), @@ -68,8 +68,8 @@ def add_collection(self, collection_id: str) -> None: collection_id: the id of the collection to link """ self.stac["collection"] = collection_id - self.add_link(rel="collection") - self.add_link(rel="parent") + self.add_link(Link(path="./collection.json", rel=Relation.COLLECTION, media_type=StacMediaType.JSON)) + self.add_link(Link(path="./collection.json", rel=Relation.PARENT, media_type=StacMediaType.JSON)) - def add_link(self, rel: str, href: str = "./collection.json", file_type: str = "application/json") -> None: - self.stac["links"].append({"rel": rel, "href": href, "type": file_type}) + def add_link(self, link: Link) -> None: + self.stac["links"].append(link.stac) diff --git a/scripts/stac/imagery/tests/create_stac_test.py b/scripts/stac/imagery/tests/create_stac_test.py new file mode 100644 index 000000000..174f32a6b --- /dev/null +++ b/scripts/stac/imagery/tests/create_stac_test.py @@ -0,0 +1,24 @@ +from pathlib import Path +from typing import cast + +from scripts.gdal.gdalinfo import GdalInfo +from scripts.stac.imagery.create_stac import create_item + + +def test_create_item_with_derived_from(tmp_path: Path) -> None: + derived_from_path = tmp_path / "derived_from_item.json" + derived_from_path.write_text('{"type": "Feature", "id": "fake_item"}') + fake_gdal_info: GdalInfo = cast( + GdalInfo, {"wgs84Extent": {"type": "Polygon", "coordinates": [[[0, 1], [1, 1], [1, 0], [0, 0]]]}} + ) + + item = create_item( + "./scripts/tests/data/empty.tiff", "2024-01-01", "2024-01-02", "abc123", fake_gdal_info, [derived_from_path.as_posix()] + ) + + assert { + "href": derived_from_path.as_posix(), + "rel": "derived_from", + "type": "application/json", + "file:checksum": "12208010297a79dc2605d99cde3d1ca63f72647637529ef6eb3d57eef1c951dcf939", + } in item.stac["links"] diff --git a/scripts/stac/link.py b/scripts/stac/link.py new file mode 100644 index 000000000..79b63ba2c --- /dev/null +++ b/scripts/stac/link.py @@ -0,0 +1,41 @@ +from enum import Enum + +from scripts.stac.util import checksum +from scripts.stac.util.media_type import StacMediaType + + +class Relation(str, Enum): + """https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#hierarchical-relations""" + + SELF = "self" + ROOT = "root" + PARENT = "parent" + COLLECTION = "collection" + ITEM = "item" + DERIVED_FROM = "derived_from" + """ https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#derived-from-relation-derived_from""" + + +# pylint: disable=too-few-public-methods +class Link: + """Represents a STAC Link Object (https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#link-object). + + Attributes: + path: A string that represents the actual link in the format of an URL. + rel: A string that represents the relationship that the link has to the object it will be added to. + media_type: `StacMediaType` of the link file. + file_content: Optional. The content of the file that will be used to store the checksum in `file:checksum`. + It assumes using the STAC `file` extension. + """ + + stac: dict[str, str] + + def __init__(self, path: str, rel: str, media_type: StacMediaType, file_content: bytes | None = None) -> None: + self.stac = { + "href": path, + "rel": rel, + "type": media_type, + } + + if file_content: + self.stac["file:checksum"] = checksum.multihash_as_hex(file_content) diff --git a/scripts/stac/util/media_type.py b/scripts/stac/util/media_type.py new file mode 100644 index 000000000..cd56c21f5 --- /dev/null +++ b/scripts/stac/util/media_type.py @@ -0,0 +1,13 @@ +from enum import Enum + + +class StacMediaType(str, Enum): + """https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#stac-media-types""" + + JSON = "application/json" + """ For STAC Catalog and Collection """ + GEOJSON = "application/geo+json" + """ https://www.iana.org/assignments/media-types/application/geo+json + + For STAC Item + """ diff --git a/scripts/standardise_validate.py b/scripts/standardise_validate.py index e0597215f..918776f6c 100644 --- a/scripts/standardise_validate.py +++ b/scripts/standardise_validate.py @@ -115,7 +115,12 @@ def main() -> None: # Create STAC and save in target item = create_item( - file.get_path_standardised(), start_datetime, end_datetime, arguments.collection_id, file.get_gdalinfo() + file.get_path_standardised(), + start_datetime, + end_datetime, + arguments.collection_id, + file.get_gdalinfo(), + file.get_derived_from_paths(), ) write(stac_item_path, dict_to_json_bytes(item.stac), content_type=ContentType.GEOJSON.value) get_log().info("stac_saved", path=stac_item_path) diff --git a/scripts/standardising.py b/scripts/standardising.py index 8e548b142..21b32f1e8 100644 --- a/scripts/standardising.py +++ b/scripts/standardising.py @@ -133,7 +133,7 @@ def standardising( footprint_file_name = files.output + SUFFIX_FOOTPRINT standardized_file_path = os.path.join(target_output, standardized_file_name) footprint_file_path = os.path.join(target_output, footprint_file_name) - tiff = FileTiff(files.inputs, preset) + tiff = FileTiff(files.inputs, preset, files.includeDerived) tiff.set_path_standardised(standardized_file_path) # Already proccessed can skip processing