From 55e24d94793e1e54da351683176fb0a33b4fbbb6 Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Thu, 29 Aug 2024 15:57:19 +1200 Subject: [PATCH 01/16] wip --- scripts/cli/cli_helper.py | 4 +++- scripts/files/file_tiff.py | 14 +++++++++++++- scripts/stac/imagery/create_stac.py | 7 +++++++ scripts/standardise_validate.py | 7 ++++++- scripts/standardising.py | 2 +- scripts/tests/data/national-dem_aws.json | 11 +++++++++++ 6 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 scripts/tests/data/national-dem_aws.json diff --git a/scripts/cli/cli_helper.py b/scripts/cli/cli_helper.py index 5eea4678a..a384bbdbf 100644 --- a/scripts/cli/cli_helper.py +++ b/scripts/cli/cli_helper.py @@ -17,6 +17,7 @@ class InputParameterError(Exception): class TileFiles(NamedTuple): output: str inputs: list[str] + includeDerived: bool = False def get_tile_files(source: str) -> list[TileFiles]: @@ -37,8 +38,9 @@ def get_tile_files(source: str) -> list[TileFiles]: [TileFiles(output='CE16_5000_1001', inputs=['s3://bucket/SN9457_CE16_10k_0501.tif'])] """ try: + # FIXME: `includeDerived` should be optional or will fail if not present source_json: list[TileFiles] = json.loads( - source, object_hook=lambda d: TileFiles(inputs=d["input"], output=d["output"]) + source, object_hook=lambda d: TileFiles(inputs=d["input"], output=d["output"], includeDerived=d["includeDerived"]) ) except (json.decoder.JSONDecodeError, KeyError) as e: get_log().error(type(e).__name__, error=str(e)) diff --git a/scripts/files/file_tiff.py b/scripts/files/file_tiff.py index 45776db5a..77d35afb8 100644 --- a/scripts/files/file_tiff.py +++ b/scripts/files/file_tiff.py @@ -1,4 +1,5 @@ import json +import os from decimal import Decimal from enum import Enum from typing import Annotated, Any @@ -30,6 +31,7 @@ def __init__( self, paths: list[str], preset: str | None = None, + include_derived: bool = False, ) -> None: paths_original = [] for p in paths: @@ -39,6 +41,8 @@ def __init__( paths_original.append(unquote(p)) self._paths_original = paths_original + if include_derived: + self._derived_from = [f"{os.path.splitext(path)[0]}.json" for path in paths_original] self._path_standardised = "" self._errors: list[dict[str, Any]] = [] self._gdalinfo: GdalInfo | None = None @@ -150,7 +154,7 @@ def get_errors(self) -> list[dict[str, Any]]: return self._errors def get_paths_original(self) -> list[str]: - """Get the path(es) of the original (non standardised) file. + """Get the path(s) of the original (non standardised) file. It can be a list of path if the standardised file is a retiled image. Returns: @@ -158,6 +162,14 @@ def get_paths_original(self) -> list[str]: """ return self._paths_original + def get_derived_from(self) -> list[str]: + """Get the path(s) of the STAC Items associated to the original TIFF files. + + Returns: + a list of STAC Item file path + """ + return self._derived_from + def get_path_standardised(self) -> str: """Get the path of the standardised file. diff --git a/scripts/stac/imagery/create_stac.py b/scripts/stac/imagery/create_stac.py index f827e192a..f10614c5a 100644 --- a/scripts/stac/imagery/create_stac.py +++ b/scripts/stac/imagery/create_stac.py @@ -1,3 +1,5 @@ +from typing import List, Optional + from linz_logger import get_log from scripts.datetimes import utc_now @@ -14,6 +16,7 @@ def create_item( end_datetime: str, collection_id: str, gdalinfo_result: GdalInfo | None = None, + derived_from: Optional[List[str]] = [], # FIXME ) -> ImageryItem: """Create an ImageryItem (STAC) to be linked to a Collection. @@ -39,5 +42,9 @@ def create_item( item.update_spatial(geometry, bbox) item.add_collection(collection_id) + for derived in derived_from: + # TODO: add checksum and maybe created datetime and updated datetime + item.add_link(rel="derived_from", href=derived) + get_log().info("ImageryItem created", path=file) return item diff --git a/scripts/standardise_validate.py b/scripts/standardise_validate.py index e0597215f..97c212cc8 100644 --- a/scripts/standardise_validate.py +++ b/scripts/standardise_validate.py @@ -115,7 +115,12 @@ def main() -> None: # Create STAC and save in target item = create_item( - file.get_path_standardised(), start_datetime, end_datetime, arguments.collection_id, file.get_gdalinfo() + file.get_path_standardised(), + start_datetime, + end_datetime, + arguments.collection_id, + file.get_gdalinfo(), + derived_from=file.get_derived_from(), ) write(stac_item_path, dict_to_json_bytes(item.stac), content_type=ContentType.GEOJSON.value) get_log().info("stac_saved", path=stac_item_path) diff --git a/scripts/standardising.py b/scripts/standardising.py index 8e548b142..21b32f1e8 100644 --- a/scripts/standardising.py +++ b/scripts/standardising.py @@ -133,7 +133,7 @@ def standardising( footprint_file_name = files.output + SUFFIX_FOOTPRINT standardized_file_path = os.path.join(target_output, standardized_file_name) footprint_file_path = os.path.join(target_output, footprint_file_name) - tiff = FileTiff(files.inputs, preset) + tiff = FileTiff(files.inputs, preset, files.includeDerived) tiff.set_path_standardised(standardized_file_path) # Already proccessed can skip processing diff --git a/scripts/tests/data/national-dem_aws.json b/scripts/tests/data/national-dem_aws.json new file mode 100644 index 000000000..7070bca22 --- /dev/null +++ b/scripts/tests/data/national-dem_aws.json @@ -0,0 +1,11 @@ +[ + { + "output": "BR20", + "input": [ + "s3://nz-elevation/west-coast/west-coast_2020-2022/dem_1m/2193/BR20_10000_0401.tiff", + "s3://nz-elevation/west-coast/west-coast_2020-2022/dem_1m/2193/BR20_10000_0402.tiff", + "s3://nz-elevation/west-coast/west-coast_2020-2022/dem_1m/2193/BR20_10000_0403.tiff" + ], + "includeDerived": true + } +] From b13056caef0c8d247e39b8cf20bb9e2958fa627f Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Fri, 30 Aug 2024 07:48:00 +1200 Subject: [PATCH 02/16] fix: get_tile_files was failing if includeDerived was not present --- scripts/cli/cli_helper.py | 9 +++++++-- scripts/cli/tests/cli_helper_test.py | 19 +++++++++++++++++++ scripts/stac/imagery/create_stac.py | 2 +- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/scripts/cli/cli_helper.py b/scripts/cli/cli_helper.py index a384bbdbf..3fafa4e9a 100644 --- a/scripts/cli/cli_helper.py +++ b/scripts/cli/cli_helper.py @@ -16,8 +16,11 @@ class InputParameterError(Exception): class TileFiles(NamedTuple): output: str + """ The tile name of the output file that will be created """ inputs: list[str] + """ The list of input files to be used to create the output file """ includeDerived: bool = False + """ Whether the STAC Item should include the derived_from links """ def get_tile_files(source: str) -> list[TileFiles]: @@ -38,9 +41,11 @@ def get_tile_files(source: str) -> list[TileFiles]: [TileFiles(output='CE16_5000_1001', inputs=['s3://bucket/SN9457_CE16_10k_0501.tif'])] """ try: - # FIXME: `includeDerived` should be optional or will fail if not present source_json: list[TileFiles] = json.loads( - source, object_hook=lambda d: TileFiles(inputs=d["input"], output=d["output"], includeDerived=d["includeDerived"]) + source, + object_hook=lambda d: TileFiles( + inputs=d["input"], output=d["output"], includeDerived=d.get("includeDerived", False) + ), ) except (json.decoder.JSONDecodeError, KeyError) as e: get_log().error(type(e).__name__, error=str(e)) diff --git a/scripts/cli/tests/cli_helper_test.py b/scripts/cli/tests/cli_helper_test.py index eca338983..1d7a660a1 100644 --- a/scripts/cli/tests/cli_helper_test.py +++ b/scripts/cli/tests/cli_helper_test.py @@ -16,11 +16,30 @@ def test_get_tile_files(subtests: SubTests) -> None: with subtests.test(): assert expected_input_filenames == source[0].inputs + + with subtests.test(): + assert source[0].includeDerived is False with subtests.test(): assert expected_output_filename_b == source[1].output +def test_get_tile_files_with_include_derived(subtests: SubTests) -> None: + file_source = '[{"output": "tile_name","input": ["file_a.tiff", "file_b.tiff"], "includeDerived": true}]' + expected_output_filename = "tile_name" + expected_input_filenames = ["file_a.tiff", "file_b.tiff"] + + source: list[TileFiles] = get_tile_files(file_source) + with subtests.test(): + assert expected_output_filename == source[0].output + + with subtests.test(): + assert expected_input_filenames == source[0].inputs + + with subtests.test(): + assert source[0].includeDerived is True + + def test_parse_list() -> None: str_list = "Auckland Council; Toitū Te Whenua Land Information New Zealand;Nelson Council;" list_parsed = parse_list(str_list) diff --git a/scripts/stac/imagery/create_stac.py b/scripts/stac/imagery/create_stac.py index f10614c5a..13760f736 100644 --- a/scripts/stac/imagery/create_stac.py +++ b/scripts/stac/imagery/create_stac.py @@ -16,7 +16,7 @@ def create_item( end_datetime: str, collection_id: str, gdalinfo_result: GdalInfo | None = None, - derived_from: Optional[List[str]] = [], # FIXME + derived_from: Optional[List[str]] = [], # FIXME ) -> ImageryItem: """Create an ImageryItem (STAC) to be linked to a Collection. From 369f504b03c878b3305bb8de75c107d021dddc01 Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Fri, 30 Aug 2024 08:03:16 +1200 Subject: [PATCH 03/16] fix: derived_from list --- scripts/cli/tests/cli_helper_test.py | 2 +- scripts/files/file_tiff.py | 1 + scripts/stac/imagery/create_stac.py | 5 ++--- scripts/standardise_validate.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/cli/tests/cli_helper_test.py b/scripts/cli/tests/cli_helper_test.py index 1d7a660a1..c43a34f0f 100644 --- a/scripts/cli/tests/cli_helper_test.py +++ b/scripts/cli/tests/cli_helper_test.py @@ -16,7 +16,7 @@ def test_get_tile_files(subtests: SubTests) -> None: with subtests.test(): assert expected_input_filenames == source[0].inputs - + with subtests.test(): assert source[0].includeDerived is False diff --git a/scripts/files/file_tiff.py b/scripts/files/file_tiff.py index 77d35afb8..ba685334b 100644 --- a/scripts/files/file_tiff.py +++ b/scripts/files/file_tiff.py @@ -41,6 +41,7 @@ def __init__( paths_original.append(unquote(p)) self._paths_original = paths_original + self._derived_from = [] if include_derived: self._derived_from = [f"{os.path.splitext(path)[0]}.json" for path in paths_original] self._path_standardised = "" diff --git a/scripts/stac/imagery/create_stac.py b/scripts/stac/imagery/create_stac.py index 13760f736..62846f892 100644 --- a/scripts/stac/imagery/create_stac.py +++ b/scripts/stac/imagery/create_stac.py @@ -1,5 +1,3 @@ -from typing import List, Optional - from linz_logger import get_log from scripts.datetimes import utc_now @@ -15,8 +13,8 @@ def create_item( start_datetime: str, end_datetime: str, collection_id: str, + derived_from: list[str], gdalinfo_result: GdalInfo | None = None, - derived_from: Optional[List[str]] = [], # FIXME ) -> ImageryItem: """Create an ImageryItem (STAC) to be linked to a Collection. @@ -25,6 +23,7 @@ def create_item( start_datetime: start date of the survey end_datetime: end date of the survey collection_id: collection id to link to the Item + derived_from: list of STAC Items from where this Item is derived gdalinfo_result: result of the gdalinfo command. Defaults to None. Returns: diff --git a/scripts/standardise_validate.py b/scripts/standardise_validate.py index 97c212cc8..24de59b3c 100644 --- a/scripts/standardise_validate.py +++ b/scripts/standardise_validate.py @@ -119,8 +119,8 @@ def main() -> None: start_datetime, end_datetime, arguments.collection_id, + file.get_derived_from(), file.get_gdalinfo(), - derived_from=file.get_derived_from(), ) write(stac_item_path, dict_to_json_bytes(item.stac), content_type=ContentType.GEOJSON.value) get_log().info("stac_saved", path=stac_item_path) From e56b79ee89f3b50a677d2ccd515e96bb727c8aa0 Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Fri, 30 Aug 2024 11:47:33 +1200 Subject: [PATCH 04/16] fix: doctest failing with new includeDerived TileFile attribute --- scripts/cli/cli_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cli/cli_helper.py b/scripts/cli/cli_helper.py index 3fafa4e9a..50962fa7d 100644 --- a/scripts/cli/cli_helper.py +++ b/scripts/cli/cli_helper.py @@ -38,7 +38,7 @@ def get_tile_files(source: str) -> list[TileFiles]: Example: >>> get_tile_files('[{"output": "CE16_5000_1001", "input": ["s3://bucket/SN9457_CE16_10k_0501.tif"]}]') - [TileFiles(output='CE16_5000_1001', inputs=['s3://bucket/SN9457_CE16_10k_0501.tif'])] + [TileFiles(output='CE16_5000_1001', inputs=['s3://bucket/SN9457_CE16_10k_0501.tif'], includeDerived=False)] """ try: source_json: list[TileFiles] = json.loads( From dd606a7e22cbd6dc7bde8c8eb2636bcd530eedac Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Tue, 3 Sep 2024 11:44:59 +1200 Subject: [PATCH 05/16] refactor: use a Link class for STAC Links --- scripts/files/file_tiff.py | 15 +++++---- scripts/files/files_helper.py | 2 +- scripts/stac/imagery/collection.py | 22 ++++++------- scripts/stac/imagery/create_stac.py | 6 ++-- scripts/stac/imagery/item.py | 14 ++++----- scripts/stac/link.py | 40 ++++++++++++++++++++++++ scripts/stac/util/media_type.py | 12 +++++++ scripts/standardise_validate.py | 2 +- scripts/tests/data/national-dem_aws.json | 11 ------- 9 files changed, 84 insertions(+), 40 deletions(-) create mode 100644 scripts/stac/link.py create mode 100644 scripts/stac/util/media_type.py delete mode 100644 scripts/tests/data/national-dem_aws.json diff --git a/scripts/files/file_tiff.py b/scripts/files/file_tiff.py index ba685334b..8040be495 100644 --- a/scripts/files/file_tiff.py +++ b/scripts/files/file_tiff.py @@ -41,9 +41,12 @@ def __init__( paths_original.append(unquote(p)) self._paths_original = paths_original - self._derived_from = [] + self._derived_from_paths = [] if include_derived: - self._derived_from = [f"{os.path.splitext(path)[0]}.json" for path in paths_original] + # Transform the TIFF paths to JSON path to point to STAC Items, + # assuming the STAC Items are in the same directory as the TIFF files + self._derived_from_paths = [f"{os.path.splitext(path)[0]}.json" for path in paths_original] + self._path_standardised = "" self._errors: list[dict[str, Any]] = [] self._gdalinfo: GdalInfo | None = None @@ -163,13 +166,13 @@ def get_paths_original(self) -> list[str]: """ return self._paths_original - def get_derived_from(self) -> list[str]: - """Get the path(s) of the STAC Items associated to the original TIFF files. + def get_derived_from_paths(self) -> list[str]: + """Get the path(s) of the STAC Items associated to the TIFF files from which the final output is derived. Returns: - a list of STAC Item file path + a list of STAC Item JSON file paths """ - return self._derived_from + return self._derived_from_paths def get_path_standardised(self) -> str: """Get the path of the standardised file. diff --git a/scripts/files/files_helper.py b/scripts/files/files_helper.py index 2606414a4..8fd2565ad 100644 --- a/scripts/files/files_helper.py +++ b/scripts/files/files_helper.py @@ -8,8 +8,8 @@ class ContentType(str, Enum): GEOTIFF = "image/tiff; application=geotiff; profile=cloud-optimized" JSON = "application/json" - # https://www.iana.org/assignments/media-types/application/geo+json GEOJSON = "application/geo+json" + """ https://www.iana.org/assignments/media-types/application/geo+json""" JPEG = "image/jpeg" diff --git a/scripts/stac/imagery/collection.py b/scripts/stac/imagery/collection.py index 6aa986894..82b69bdf2 100644 --- a/scripts/stac/imagery/collection.py +++ b/scripts/stac/imagery/collection.py @@ -25,8 +25,10 @@ SubtypeParameterError, ) from scripts.stac.imagery.provider import Provider, ProviderRole +from scripts.stac.link import Link, Relation from scripts.stac.util import checksum from scripts.stac.util.STAC_VERSION import STAC_VERSION +from scripts.stac.util.media_type import StacMediaType from scripts.stac.util.stac_extensions import StacExtensions CAPTURE_AREA_FILE_NAME = "capture-area.geojson" @@ -135,22 +137,18 @@ def add_item(self, item: dict[Any, Any]) -> None: item: STAC Item to add """ item_self_link = next((feat for feat in item["links"] if feat["rel"] == "self"), None) - file_checksum = checksum.multihash_as_hex(dict_to_json_bytes(item)) if item_self_link: - self.add_link(href=item_self_link["href"], file_checksum=file_checksum) + self.stac["links"].append( + Link( + path=item_self_link["href"], + rel=Relation.ITEM, + media_type=StacMediaType.JSON, + file_content=dict_to_json_bytes(item), + ).stac + ) self.update_temporal_extent(item["properties"]["start_datetime"], item["properties"]["end_datetime"]) self.update_spatial_extent(item["bbox"]) - def add_link(self, href: str, file_checksum: str) -> None: - """Add a `link` to the existing `links` list of the Collection. - - Args: - href: path - file_checksum: Optional checksum of file. - """ - link = {"rel": "item", "href": href, "type": "application/json", "file:checksum": file_checksum} - self.stac["links"].append(link) - def add_providers(self, providers: list[Provider]) -> None: """Add a list of Providers to the existing list of `providers` of the Collection. diff --git a/scripts/stac/imagery/create_stac.py b/scripts/stac/imagery/create_stac.py index 62846f892..7656458a0 100644 --- a/scripts/stac/imagery/create_stac.py +++ b/scripts/stac/imagery/create_stac.py @@ -2,10 +2,13 @@ from scripts.datetimes import utc_now from scripts.files.files_helper import get_file_name_from_path +from scripts.files.fs import read from scripts.files.geotiff import get_extents from scripts.gdal.gdal_helper import gdal_info from scripts.gdal.gdalinfo import GdalInfo from scripts.stac.imagery.item import ImageryItem +from scripts.stac.link import Link, Relation +from scripts.stac.util.media_type import StacMediaType def create_item( @@ -42,8 +45,7 @@ def create_item( item.add_collection(collection_id) for derived in derived_from: - # TODO: add checksum and maybe created datetime and updated datetime - item.add_link(rel="derived_from", href=derived) + item.add_link(Link(path=derived, rel=Relation.DERIVED_FROM, media_type=StacMediaType.JSON, file_content=read(derived))) get_log().info("ImageryItem created", path=file) return item diff --git a/scripts/stac/imagery/item.py b/scripts/stac/imagery/item.py index a28144ff3..0e3e78af4 100644 --- a/scripts/stac/imagery/item.py +++ b/scripts/stac/imagery/item.py @@ -6,8 +6,10 @@ from scripts.datetimes import format_rfc_3339_datetime_string from scripts.files import fs from scripts.files.fs import modified +from scripts.stac.link import Link, Relation from scripts.stac.util import checksum from scripts.stac.util.STAC_VERSION import STAC_VERSION +from scripts.stac.util.media_type import StacMediaType from scripts.stac.util.stac_extensions import StacExtensions @@ -22,9 +24,7 @@ def __init__(self, id_: str, file: str, now: Callable[[], datetime]) -> None: "type": "Feature", "stac_version": STAC_VERSION, "id": id_, - "links": [ - {"rel": "self", "href": f"./{id_}.json", "type": "application/json"}, - ], + "links": [Link(path=f"./{id_}.json", rel=Relation.SELF, media_type=StacMediaType.JSON).stac], "assets": { "visual": { "href": os.path.join(".", os.path.basename(file)), @@ -68,8 +68,8 @@ def add_collection(self, collection_id: str) -> None: collection_id: the id of the collection to link """ self.stac["collection"] = collection_id - self.add_link(rel="collection") - self.add_link(rel="parent") + self.add_link(Link(path="./collection.json", rel=Relation.COLLECTION, media_type=StacMediaType.JSON)) + self.add_link(Link(path="./collection.json", rel=Relation.PARENT, media_type=StacMediaType.JSON)) - def add_link(self, rel: str, href: str = "./collection.json", file_type: str = "application/json") -> None: - self.stac["links"].append({"rel": rel, "href": href, "type": file_type}) + def add_link(self, link: Link) -> None: + self.stac["links"].append(link.stac) diff --git a/scripts/stac/link.py b/scripts/stac/link.py new file mode 100644 index 000000000..13e652ce1 --- /dev/null +++ b/scripts/stac/link.py @@ -0,0 +1,40 @@ + +from enum import Enum + +from scripts.stac.util import checksum +from scripts.stac.util.media_type import StacMediaType + + +class Relation(str, Enum): + """ https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#hierarchical-relations """ + SELF = "self" + ROOT = "root" + PARENT = "parent" + COLLECTION = "collection" + ITEM = "item" + DERIVED_FROM = "derived_from" + """ TODO: Explain where `derived_from` comes from. It is not in https://www.iana.org/assignments/link-relations/link-relations.xhtml nor in the stac-spec""" + +class Link: + """Represents a STAC Link Object (https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#link-object). + + Attributes: + path: A string that represents the actual link in the format of an URL. + rel: A string that represents the relationship that the link has to the object it will be added to. + media_type: `StacMediaType` of the link file. + file_content: The content of the file that will be used to store the checksum in `file:checksum`. It assumes using the STAC `file` extension. + """ + stac: dict[str, str] + + def __init__(self, path: str, rel: str, media_type: StacMediaType | None, file_content: bytes | None = None) -> None: + self.stac = { + "href": path, + "rel": rel, + } + if media_type: + self.stac["type"] = media_type + if file_content: + self.stac["file:checksum"] = checksum.multihash_as_hex(file_content) + + + diff --git a/scripts/stac/util/media_type.py b/scripts/stac/util/media_type.py new file mode 100644 index 000000000..604edaea9 --- /dev/null +++ b/scripts/stac/util/media_type.py @@ -0,0 +1,12 @@ +from enum import Enum + + +class StacMediaType(str, Enum): + """ https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#stac-media-types """ + JSON = "application/json" + """ For STAC Catalog and Collection """ + GEOJSON = "application/geo+json" + """ https://www.iana.org/assignments/media-types/application/geo+json + + For STAC Item + """ diff --git a/scripts/standardise_validate.py b/scripts/standardise_validate.py index 24de59b3c..fc7bfb74c 100644 --- a/scripts/standardise_validate.py +++ b/scripts/standardise_validate.py @@ -119,7 +119,7 @@ def main() -> None: start_datetime, end_datetime, arguments.collection_id, - file.get_derived_from(), + file.get_derived_from_paths(), file.get_gdalinfo(), ) write(stac_item_path, dict_to_json_bytes(item.stac), content_type=ContentType.GEOJSON.value) diff --git a/scripts/tests/data/national-dem_aws.json b/scripts/tests/data/national-dem_aws.json deleted file mode 100644 index 7070bca22..000000000 --- a/scripts/tests/data/national-dem_aws.json +++ /dev/null @@ -1,11 +0,0 @@ -[ - { - "output": "BR20", - "input": [ - "s3://nz-elevation/west-coast/west-coast_2020-2022/dem_1m/2193/BR20_10000_0401.tiff", - "s3://nz-elevation/west-coast/west-coast_2020-2022/dem_1m/2193/BR20_10000_0402.tiff", - "s3://nz-elevation/west-coast/west-coast_2020-2022/dem_1m/2193/BR20_10000_0403.tiff" - ], - "includeDerived": true - } -] From f36684965d301552885844e8cb70e284f3d65e20 Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Tue, 3 Sep 2024 11:50:47 +1200 Subject: [PATCH 06/16] fix: formatting --- scripts/stac/link.py | 20 +++++++++++--------- scripts/stac/util/media_type.py | 3 ++- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/scripts/stac/link.py b/scripts/stac/link.py index 13e652ce1..acb0302f2 100644 --- a/scripts/stac/link.py +++ b/scripts/stac/link.py @@ -1,4 +1,3 @@ - from enum import Enum from scripts.stac.util import checksum @@ -6,15 +5,19 @@ class Relation(str, Enum): - """ https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#hierarchical-relations """ + """https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#hierarchical-relations""" + SELF = "self" ROOT = "root" PARENT = "parent" COLLECTION = "collection" - ITEM = "item" - DERIVED_FROM = "derived_from" - """ TODO: Explain where `derived_from` comes from. It is not in https://www.iana.org/assignments/link-relations/link-relations.xhtml nor in the stac-spec""" + ITEM = "item" + DERIVED_FROM = "derived_from" + """ TODO: Explain where `derived_from` comes from. + It is not in https://www.iana.org/assignments/link-relations/link-relations.xhtml nor in the stac-spec""" + +# pylint: disable=too-few-public-methods class Link: """Represents a STAC Link Object (https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#link-object). @@ -22,8 +25,10 @@ class Link: path: A string that represents the actual link in the format of an URL. rel: A string that represents the relationship that the link has to the object it will be added to. media_type: `StacMediaType` of the link file. - file_content: The content of the file that will be used to store the checksum in `file:checksum`. It assumes using the STAC `file` extension. + file_content: The content of the file that will be used to store the checksum in `file:checksum`. + It assumes using the STAC `file` extension. """ + stac: dict[str, str] def __init__(self, path: str, rel: str, media_type: StacMediaType | None, file_content: bytes | None = None) -> None: @@ -35,6 +40,3 @@ def __init__(self, path: str, rel: str, media_type: StacMediaType | None, file_c self.stac["type"] = media_type if file_content: self.stac["file:checksum"] = checksum.multihash_as_hex(file_content) - - - diff --git a/scripts/stac/util/media_type.py b/scripts/stac/util/media_type.py index 604edaea9..cd56c21f5 100644 --- a/scripts/stac/util/media_type.py +++ b/scripts/stac/util/media_type.py @@ -2,7 +2,8 @@ class StacMediaType(str, Enum): - """ https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#stac-media-types """ + """https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#stac-media-types""" + JSON = "application/json" """ For STAC Catalog and Collection """ GEOJSON = "application/geo+json" From 3781e7fdd441a9dd0cdcab83c64457e2c08c283d Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Tue, 3 Sep 2024 12:00:47 +1200 Subject: [PATCH 07/16] docs: derived_from documentation link --- scripts/stac/link.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/stac/link.py b/scripts/stac/link.py index acb0302f2..6868c55e1 100644 --- a/scripts/stac/link.py +++ b/scripts/stac/link.py @@ -13,8 +13,7 @@ class Relation(str, Enum): COLLECTION = "collection" ITEM = "item" DERIVED_FROM = "derived_from" - """ TODO: Explain where `derived_from` comes from. - It is not in https://www.iana.org/assignments/link-relations/link-relations.xhtml nor in the stac-spec""" + """ https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#derived-from-relation-derived_from""" # pylint: disable=too-few-public-methods From 000e61a4453e30b27a76078cdd47885d1297371f Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Tue, 3 Sep 2024 12:10:48 +1200 Subject: [PATCH 08/16] refactor: media_type should not be optional --- scripts/stac/link.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/stac/link.py b/scripts/stac/link.py index 6868c55e1..9c45a29e4 100644 --- a/scripts/stac/link.py +++ b/scripts/stac/link.py @@ -30,7 +30,7 @@ class Link: stac: dict[str, str] - def __init__(self, path: str, rel: str, media_type: StacMediaType | None, file_content: bytes | None = None) -> None: + def __init__(self, path: str, rel: str, media_type: StacMediaType, file_content: bytes | None = None) -> None: self.stac = { "href": path, "rel": rel, From 866fd57e03f9ec4e46947f893fa0162d11bd58e9 Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Tue, 3 Sep 2024 12:11:32 +1200 Subject: [PATCH 09/16] docs: file_content is optional --- scripts/stac/link.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/stac/link.py b/scripts/stac/link.py index 9c45a29e4..a191b6aa8 100644 --- a/scripts/stac/link.py +++ b/scripts/stac/link.py @@ -24,7 +24,7 @@ class Link: path: A string that represents the actual link in the format of an URL. rel: A string that represents the relationship that the link has to the object it will be added to. media_type: `StacMediaType` of the link file. - file_content: The content of the file that will be used to store the checksum in `file:checksum`. + file_content: Optional. The content of the file that will be used to store the checksum in `file:checksum`. It assumes using the STAC `file` extension. """ From 22a90adaeee4a0a6636d25238fd3a67d0603b82d Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Tue, 3 Sep 2024 12:13:40 +1200 Subject: [PATCH 10/16] refactor: always add media_type --- scripts/stac/link.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/stac/link.py b/scripts/stac/link.py index a191b6aa8..4e34b2689 100644 --- a/scripts/stac/link.py +++ b/scripts/stac/link.py @@ -34,8 +34,8 @@ def __init__(self, path: str, rel: str, media_type: StacMediaType, file_content: self.stac = { "href": path, "rel": rel, + "type": media_type, } - if media_type: - self.stac["type"] = media_type + if file_content: self.stac["file:checksum"] = checksum.multihash_as_hex(file_content) From fd96dce7c46747e3cec71fde14a6f14763876f6c Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Tue, 3 Sep 2024 15:33:46 +1200 Subject: [PATCH 11/16] test: should add derived_from links --- scripts/files/file_tiff.py | 6 +++--- scripts/stac/imagery/create_stac.py | 11 +++++++---- scripts/stac/imagery/tests/create_stac_test.py | 11 +++++++++++ scripts/stac/link.py | 2 +- scripts/standardise_validate.py | 2 +- 5 files changed, 23 insertions(+), 9 deletions(-) create mode 100644 scripts/stac/imagery/tests/create_stac_test.py diff --git a/scripts/files/file_tiff.py b/scripts/files/file_tiff.py index 8040be495..8b9c016f1 100644 --- a/scripts/files/file_tiff.py +++ b/scripts/files/file_tiff.py @@ -41,7 +41,7 @@ def __init__( paths_original.append(unquote(p)) self._paths_original = paths_original - self._derived_from_paths = [] + self._derived_from_paths = None if include_derived: # Transform the TIFF paths to JSON path to point to STAC Items, # assuming the STAC Items are in the same directory as the TIFF files @@ -166,11 +166,11 @@ def get_paths_original(self) -> list[str]: """ return self._paths_original - def get_derived_from_paths(self) -> list[str]: + def get_derived_from_paths(self) -> list[str] | None: """Get the path(s) of the STAC Items associated to the TIFF files from which the final output is derived. Returns: - a list of STAC Item JSON file paths + a list of STAC Item JSON file paths or None if not derived from other files. """ return self._derived_from_paths diff --git a/scripts/stac/imagery/create_stac.py b/scripts/stac/imagery/create_stac.py index 7656458a0..497182529 100644 --- a/scripts/stac/imagery/create_stac.py +++ b/scripts/stac/imagery/create_stac.py @@ -16,8 +16,8 @@ def create_item( start_datetime: str, end_datetime: str, collection_id: str, - derived_from: list[str], gdalinfo_result: GdalInfo | None = None, + derived_from: list[str] | None = None, ) -> ImageryItem: """Create an ImageryItem (STAC) to be linked to a Collection. @@ -26,8 +26,8 @@ def create_item( start_datetime: start date of the survey end_datetime: end date of the survey collection_id: collection id to link to the Item - derived_from: list of STAC Items from where this Item is derived gdalinfo_result: result of the gdalinfo command. Defaults to None. + derived_from: list of STAC Items from where this Item is derived. Defaults to None. Returns: a STAC Item wrapped in ImageryItem @@ -44,8 +44,11 @@ def create_item( item.update_spatial(geometry, bbox) item.add_collection(collection_id) - for derived in derived_from: - item.add_link(Link(path=derived, rel=Relation.DERIVED_FROM, media_type=StacMediaType.JSON, file_content=read(derived))) + if derived_from is not None: + for derived in derived_from: + item.add_link( + Link(path=derived, rel=Relation.DERIVED_FROM, media_type=StacMediaType.JSON, file_content=read(derived)) + ) get_log().info("ImageryItem created", path=file) return item diff --git a/scripts/stac/imagery/tests/create_stac_test.py b/scripts/stac/imagery/tests/create_stac_test.py new file mode 100644 index 000000000..0f3fa6012 --- /dev/null +++ b/scripts/stac/imagery/tests/create_stac_test.py @@ -0,0 +1,11 @@ +from scripts.stac.imagery.create_stac import create_item + + +def test_create_item_with_derived_from(tmp_path) -> None: + derived_from_path = tmp_path / "derived_from_item.json" + derived_from_path.write_text('{"type": "Feature", "id": "fake_item"}') + fake_gdal_info = {"wgs84Extent":{"type":"Polygon","coordinates":[[[0,1], [1,1], [1,0], [0,0]]]}} + + item = create_item("./scripts/tests/data/empty.tiff", "2024-01-01", "2024-01-02", "abc123", fake_gdal_info, [f"{derived_from_path}"]) + + assert {"href": f"{derived_from_path}", "rel": "derived_from", "type": "application/json", "file:checksum": "12208010297a79dc2605d99cde3d1ca63f72647637529ef6eb3d57eef1c951dcf939"} in item.stac["links"] diff --git a/scripts/stac/link.py b/scripts/stac/link.py index 4e34b2689..79b63ba2c 100644 --- a/scripts/stac/link.py +++ b/scripts/stac/link.py @@ -36,6 +36,6 @@ def __init__(self, path: str, rel: str, media_type: StacMediaType, file_content: "rel": rel, "type": media_type, } - + if file_content: self.stac["file:checksum"] = checksum.multihash_as_hex(file_content) diff --git a/scripts/standardise_validate.py b/scripts/standardise_validate.py index fc7bfb74c..918776f6c 100644 --- a/scripts/standardise_validate.py +++ b/scripts/standardise_validate.py @@ -119,8 +119,8 @@ def main() -> None: start_datetime, end_datetime, arguments.collection_id, - file.get_derived_from_paths(), file.get_gdalinfo(), + file.get_derived_from_paths(), ) write(stac_item_path, dict_to_json_bytes(item.stac), content_type=ContentType.GEOJSON.value) get_log().info("stac_saved", path=stac_item_path) From 894b8618bb1ede3cb770775b1a699e8d42fc4de0 Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Tue, 3 Sep 2024 15:47:05 +1200 Subject: [PATCH 12/16] fix: typing --- .../stac/imagery/tests/create_stac_test.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/stac/imagery/tests/create_stac_test.py b/scripts/stac/imagery/tests/create_stac_test.py index 0f3fa6012..f50917fb5 100644 --- a/scripts/stac/imagery/tests/create_stac_test.py +++ b/scripts/stac/imagery/tests/create_stac_test.py @@ -1,11 +1,19 @@ +from pathlib import Path + from scripts.stac.imagery.create_stac import create_item -def test_create_item_with_derived_from(tmp_path) -> None: +def test_create_item_with_derived_from(tmp_path: Path) -> None: derived_from_path = tmp_path / "derived_from_item.json" derived_from_path.write_text('{"type": "Feature", "id": "fake_item"}') - fake_gdal_info = {"wgs84Extent":{"type":"Polygon","coordinates":[[[0,1], [1,1], [1,0], [0,0]]]}} - - item = create_item("./scripts/tests/data/empty.tiff", "2024-01-01", "2024-01-02", "abc123", fake_gdal_info, [f"{derived_from_path}"]) - - assert {"href": f"{derived_from_path}", "rel": "derived_from", "type": "application/json", "file:checksum": "12208010297a79dc2605d99cde3d1ca63f72647637529ef6eb3d57eef1c951dcf939"} in item.stac["links"] + + item = create_item( + "./scripts/tests/data/empty.tiff", "2024-01-01", "2024-01-02", "abc123", derived_from=[f"{derived_from_path}"] + ) + + assert { + "href": f"{derived_from_path}", + "rel": "derived_from", + "type": "application/json", + "file:checksum": "12208010297a79dc2605d99cde3d1ca63f72647637529ef6eb3d57eef1c951dcf939", + } in item.stac["links"] From 11b59e9bcd783f02886ab04adbc86e5adce3cffb Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Tue, 3 Sep 2024 15:53:54 +1200 Subject: [PATCH 13/16] test: fix gdal_info --- scripts/stac/imagery/tests/create_stac_test.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scripts/stac/imagery/tests/create_stac_test.py b/scripts/stac/imagery/tests/create_stac_test.py index f50917fb5..27cec0300 100644 --- a/scripts/stac/imagery/tests/create_stac_test.py +++ b/scripts/stac/imagery/tests/create_stac_test.py @@ -1,14 +1,19 @@ from pathlib import Path +from typing import cast +from scripts.gdal.gdalinfo import GdalInfo from scripts.stac.imagery.create_stac import create_item def test_create_item_with_derived_from(tmp_path: Path) -> None: derived_from_path = tmp_path / "derived_from_item.json" derived_from_path.write_text('{"type": "Feature", "id": "fake_item"}') + fake_gdal_info: GdalInfo = cast( + GdalInfo, {"wgs84Extent": {"type": "Polygon", "coordinates": [[[0, 1], [1, 1], [1, 0], [0, 0]]]}} + ) item = create_item( - "./scripts/tests/data/empty.tiff", "2024-01-01", "2024-01-02", "abc123", derived_from=[f"{derived_from_path}"] + "./scripts/tests/data/empty.tiff", "2024-01-01", "2024-01-02", "abc123", fake_gdal_info, [f"{derived_from_path}"] ) assert { From 7f35315d7e5164ec85c72e5a596ff9a1fc295843 Mon Sep 17 00:00:00 2001 From: paulfouquet <86932794+paulfouquet@users.noreply.github.com> Date: Thu, 5 Sep 2024 10:05:42 +1200 Subject: [PATCH 14/16] refactor: improve readibility Co-authored-by: Victor Engmark --- scripts/cli/cli_helper.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/cli/cli_helper.py b/scripts/cli/cli_helper.py index 50962fa7d..66b808ddf 100644 --- a/scripts/cli/cli_helper.py +++ b/scripts/cli/cli_helper.py @@ -17,8 +17,10 @@ class InputParameterError(Exception): class TileFiles(NamedTuple): output: str """ The tile name of the output file that will be created """ + inputs: list[str] """ The list of input files to be used to create the output file """ + includeDerived: bool = False """ Whether the STAC Item should include the derived_from links """ From e9010345c6c4d76a035bb95dd61f06e89799a5f7 Mon Sep 17 00:00:00 2001 From: paulfouquet <86932794+paulfouquet@users.noreply.github.com> Date: Thu, 5 Sep 2024 10:08:48 +1200 Subject: [PATCH 15/16] test: improve test output Co-authored-by: Victor Engmark --- scripts/cli/tests/cli_helper_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cli/tests/cli_helper_test.py b/scripts/cli/tests/cli_helper_test.py index c43a34f0f..c2edd815b 100644 --- a/scripts/cli/tests/cli_helper_test.py +++ b/scripts/cli/tests/cli_helper_test.py @@ -17,7 +17,7 @@ def test_get_tile_files(subtests: SubTests) -> None: with subtests.test(): assert expected_input_filenames == source[0].inputs - with subtests.test(): + with subtests.test(msg="Should not include derived by default"): assert source[0].includeDerived is False with subtests.test(): From f17fe9690f9788c2f5814f020737d8bb10d52fc5 Mon Sep 17 00:00:00 2001 From: Paul Fouquet Date: Thu, 5 Sep 2024 10:17:53 +1200 Subject: [PATCH 16/16] test: derived_from_path is a Path --- scripts/stac/imagery/tests/create_stac_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/stac/imagery/tests/create_stac_test.py b/scripts/stac/imagery/tests/create_stac_test.py index 27cec0300..174f32a6b 100644 --- a/scripts/stac/imagery/tests/create_stac_test.py +++ b/scripts/stac/imagery/tests/create_stac_test.py @@ -13,11 +13,11 @@ def test_create_item_with_derived_from(tmp_path: Path) -> None: ) item = create_item( - "./scripts/tests/data/empty.tiff", "2024-01-01", "2024-01-02", "abc123", fake_gdal_info, [f"{derived_from_path}"] + "./scripts/tests/data/empty.tiff", "2024-01-01", "2024-01-02", "abc123", fake_gdal_info, [derived_from_path.as_posix()] ) assert { - "href": f"{derived_from_path}", + "href": derived_from_path.as_posix(), "rel": "derived_from", "type": "application/json", "file:checksum": "12208010297a79dc2605d99cde3d1ca63f72647637529ef6eb3d57eef1c951dcf939",