Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add derived_from links to item TDE-1251 #1043

Merged
merged 16 commits into from
Sep 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions scripts/cli/cli_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,13 @@ class InputParameterError(Exception):

class TileFiles(NamedTuple):
    """Output tile together with the input files used to build it.

    Attributes:
        output: The tile name of the output file that will be created.
        inputs: The list of input files to be used to create the output file.
        includeDerived: Whether the STAC Item should include the derived_from links.
    """

    output: str
    inputs: list[str]
    includeDerived: bool = False


def get_tile_files(source: str) -> list[TileFiles]:
Expand All @@ -34,11 +40,14 @@ def get_tile_files(source: str) -> list[TileFiles]:

Example:
>>> get_tile_files('[{"output": "CE16_5000_1001", "input": ["s3://bucket/SN9457_CE16_10k_0501.tif"]}]')
[TileFiles(output='CE16_5000_1001', inputs=['s3://bucket/SN9457_CE16_10k_0501.tif'])]
[TileFiles(output='CE16_5000_1001', inputs=['s3://bucket/SN9457_CE16_10k_0501.tif'], includeDerived=False)]
"""
try:
source_json: list[TileFiles] = json.loads(
source, object_hook=lambda d: TileFiles(inputs=d["input"], output=d["output"])
source,
object_hook=lambda d: TileFiles(
inputs=d["input"], output=d["output"], includeDerived=d.get("includeDerived", False)
),
)
except (json.decoder.JSONDecodeError, KeyError) as e:
get_log().error(type(e).__name__, error=str(e))
Expand Down
19 changes: 19 additions & 0 deletions scripts/cli/tests/cli_helper_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,29 @@ def test_get_tile_files(subtests: SubTests) -> None:
with subtests.test():
assert expected_input_filenames == source[0].inputs

with subtests.test(msg="Should not include derived by default"):
assert source[0].includeDerived is False

with subtests.test():
assert expected_output_filename_b == source[1].output


def test_get_tile_files_with_include_derived(subtests: SubTests) -> None:
    """Check that an explicit `"includeDerived": true` in the JSON source is carried onto the parsed tile."""
    file_source = '[{"output": "tile_name","input": ["file_a.tiff", "file_b.tiff"], "includeDerived": true}]'

    tiles: list[TileFiles] = get_tile_files(file_source)

    # Index inside each subtest so that a failure is reported per assertion.
    with subtests.test():
        assert tiles[0].output == "tile_name"

    with subtests.test():
        assert tiles[0].inputs == ["file_a.tiff", "file_b.tiff"]

    with subtests.test():
        assert tiles[0].includeDerived is True


def test_parse_list() -> None:
str_list = "Auckland Council; Toitū Te Whenua Land Information New Zealand;Nelson Council;"
list_parsed = parse_list(str_list)
Expand Down
18 changes: 17 additions & 1 deletion scripts/files/file_tiff.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import os
from decimal import Decimal
from enum import Enum
from typing import Annotated, Any
Expand Down Expand Up @@ -30,6 +31,7 @@ def __init__(
self,
paths: list[str],
preset: str | None = None,
include_derived: bool = False,
) -> None:
paths_original = []
for p in paths:
Expand All @@ -39,6 +41,12 @@ def __init__(
paths_original.append(unquote(p))

self._paths_original = paths_original
self._derived_from_paths = None
if include_derived:
# Transform the TIFF paths to JSON path to point to STAC Items,
# assuming the STAC Items are in the same directory as the TIFF files
l0b0 marked this conversation as resolved.
Show resolved Hide resolved
self._derived_from_paths = [f"{os.path.splitext(path)[0]}.json" for path in paths_original]

self._path_standardised = ""
self._errors: list[dict[str, Any]] = []
self._gdalinfo: GdalInfo | None = None
Expand Down Expand Up @@ -150,14 +158,22 @@ def get_errors(self) -> list[dict[str, Any]]:
return self._errors

def get_paths_original(self) -> list[str]:
    """Return the path(s) of the original (non standardised) file(s).

    There can be several paths when the standardised file is a retiled image.

    Returns:
        a list of file paths
    """
    paths_original = self._paths_original
    return paths_original

def get_derived_from_paths(self) -> list[str] | None:
"""Get the path(s) of the STAC Items associated to the TIFF files from which the final output is derived.

Returns:
a list of STAC Item JSON file paths or None if not derived from other files.
"""
return self._derived_from_paths

def get_path_standardised(self) -> str:
"""Get the path of the standardised file.

Expand Down
2 changes: 1 addition & 1 deletion scripts/files/files_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
class ContentType(str, Enum):
    """Content type values declared for the files handled by the scripts."""

    GEOTIFF = "image/tiff; application=geotiff; profile=cloud-optimized"
    JSON = "application/json"
    # https://www.iana.org/assignments/media-types/application/geo+json
    GEOJSON = "application/geo+json"
    JPEG = "image/jpeg"


Expand Down
22 changes: 10 additions & 12 deletions scripts/stac/imagery/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@
SubtypeParameterError,
)
from scripts.stac.imagery.provider import Provider, ProviderRole
from scripts.stac.link import Link, Relation
from scripts.stac.util import checksum
from scripts.stac.util.STAC_VERSION import STAC_VERSION
from scripts.stac.util.media_type import StacMediaType
from scripts.stac.util.stac_extensions import StacExtensions

CAPTURE_AREA_FILE_NAME = "capture-area.geojson"
Expand Down Expand Up @@ -135,22 +137,18 @@ def add_item(self, item: dict[Any, Any]) -> None:
item: STAC Item to add
"""
item_self_link = next((feat for feat in item["links"] if feat["rel"] == "self"), None)
file_checksum = checksum.multihash_as_hex(dict_to_json_bytes(item))
if item_self_link:
self.add_link(href=item_self_link["href"], file_checksum=file_checksum)
self.stac["links"].append(
Link(
path=item_self_link["href"],
rel=Relation.ITEM,
media_type=StacMediaType.JSON,
file_content=dict_to_json_bytes(item),
).stac
)
self.update_temporal_extent(item["properties"]["start_datetime"], item["properties"]["end_datetime"])
self.update_spatial_extent(item["bbox"])

def add_link(self, href: str, file_checksum: str) -> None:
"""Add a `link` to the existing `links` list of the Collection.

Args:
href: path
file_checksum: Optional checksum of file.
"""
link = {"rel": "item", "href": href, "type": "application/json", "file:checksum": file_checksum}
self.stac["links"].append(link)

def add_providers(self, providers: list[Provider]) -> None:
"""Add a list of Providers to the existing list of `providers` of the Collection.

Expand Down
11 changes: 11 additions & 0 deletions scripts/stac/imagery/create_stac.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@

from scripts.datetimes import utc_now
from scripts.files.files_helper import get_file_name_from_path
from scripts.files.fs import read
from scripts.files.geotiff import get_extents
from scripts.gdal.gdal_helper import gdal_info
from scripts.gdal.gdalinfo import GdalInfo
from scripts.stac.imagery.item import ImageryItem
from scripts.stac.link import Link, Relation
from scripts.stac.util.media_type import StacMediaType


def create_item(
Expand All @@ -14,6 +17,7 @@ def create_item(
end_datetime: str,
collection_id: str,
gdalinfo_result: GdalInfo | None = None,
derived_from: list[str] | None = None,
) -> ImageryItem:
"""Create an ImageryItem (STAC) to be linked to a Collection.

Expand All @@ -23,6 +27,7 @@ def create_item(
end_datetime: end date of the survey
collection_id: collection id to link to the Item
gdalinfo_result: result of the gdalinfo command. Defaults to None.
derived_from: list of STAC Items from where this Item is derived. Defaults to None.

Returns:
a STAC Item wrapped in ImageryItem
Expand All @@ -39,5 +44,11 @@ def create_item(
item.update_spatial(geometry, bbox)
item.add_collection(collection_id)

if derived_from is not None:
for derived in derived_from:
item.add_link(
Link(path=derived, rel=Relation.DERIVED_FROM, media_type=StacMediaType.JSON, file_content=read(derived))
)

get_log().info("ImageryItem created", path=file)
return item
14 changes: 7 additions & 7 deletions scripts/stac/imagery/item.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,10 @@
from scripts.datetimes import format_rfc_3339_datetime_string
from scripts.files import fs
from scripts.files.fs import modified
from scripts.stac.link import Link, Relation
from scripts.stac.util import checksum
from scripts.stac.util.STAC_VERSION import STAC_VERSION
from scripts.stac.util.media_type import StacMediaType
from scripts.stac.util.stac_extensions import StacExtensions


Expand All @@ -22,9 +24,7 @@ def __init__(self, id_: str, file: str, now: Callable[[], datetime]) -> None:
"type": "Feature",
"stac_version": STAC_VERSION,
"id": id_,
"links": [
{"rel": "self", "href": f"./{id_}.json", "type": "application/json"},
],
"links": [Link(path=f"./{id_}.json", rel=Relation.SELF, media_type=StacMediaType.JSON).stac],
"assets": {
"visual": {
"href": os.path.join(".", os.path.basename(file)),
Expand Down Expand Up @@ -68,8 +68,8 @@ def add_collection(self, collection_id: str) -> None:
collection_id: the id of the collection to link
"""
self.stac["collection"] = collection_id
self.add_link(rel="collection")
self.add_link(rel="parent")
self.add_link(Link(path="./collection.json", rel=Relation.COLLECTION, media_type=StacMediaType.JSON))
self.add_link(Link(path="./collection.json", rel=Relation.PARENT, media_type=StacMediaType.JSON))

def add_link(self, rel: str, href: str = "./collection.json", file_type: str = "application/json") -> None:
self.stac["links"].append({"rel": rel, "href": href, "type": file_type})
def add_link(self, link: Link) -> None:
self.stac["links"].append(link.stac)
24 changes: 24 additions & 0 deletions scripts/stac/imagery/tests/create_stac_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from pathlib import Path
from typing import cast

from scripts.gdal.gdalinfo import GdalInfo
from scripts.stac.imagery.create_stac import create_item


def test_create_item_with_derived_from(tmp_path: Path) -> None:
    """Check that `create_item` adds a `derived_from` link, with its file checksum, for the given STAC Item path."""
    derived_from_path = tmp_path / "derived_from_item.json"
    derived_from_path.write_text('{"type": "Feature", "id": "fake_item"}')
    derived_from_href = derived_from_path.as_posix()

    wgs84_extent = {"type": "Polygon", "coordinates": [[[0, 1], [1, 1], [1, 0], [0, 0]]]}
    fake_gdal_info: GdalInfo = cast(GdalInfo, {"wgs84Extent": wgs84_extent})

    item = create_item(
        "./scripts/tests/data/empty.tiff",
        "2024-01-01",
        "2024-01-02",
        "abc123",
        fake_gdal_info,
        [derived_from_href],
    )

    expected_link = {
        "href": derived_from_href,
        "rel": "derived_from",
        "type": "application/json",
        "file:checksum": "12208010297a79dc2605d99cde3d1ca63f72647637529ef6eb3d57eef1c951dcf939",
    }
    assert expected_link in item.stac["links"]
41 changes: 41 additions & 0 deletions scripts/stac/link.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from enum import Enum

from scripts.stac.util import checksum
from scripts.stac.util.media_type import StacMediaType


class Relation(str, Enum):
    """Relation (`rel`) values used on STAC Link Objects.

    https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#hierarchical-relations
    """

    SELF = "self"
    ROOT = "root"
    PARENT = "parent"
    COLLECTION = "collection"
    ITEM = "item"
    # https://github.com/radiantearth/stac-spec/blob/master/best-practices.md#derived-from-relation-derived_from
    DERIVED_FROM = "derived_from"


# pylint: disable=too-few-public-methods
class Link:
    """Represents a STAC Link Object (https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#link-object).

    Attributes:
        stac: The Link Object as a dictionary with the `href`, `rel` and `type` keys,
            plus `file:checksum` when the file content was provided.
    """

    stac: dict[str, str]

    def __init__(self, path: str, rel: str, media_type: StacMediaType, file_content: bytes | None = None) -> None:
        """Build the dictionary representation of the link.

        Args:
            path: A string that represents the actual link in the format of a URL.
            rel: A string that represents the relationship that the link has to the object it will be added to.
            media_type: `StacMediaType` of the link file.
            file_content: Optional. The content of the file that will be used to store the checksum in `file:checksum`.
                It assumes using the STAC `file` extension.
        """
        self.stac = {
            "href": path,
            "rel": rel,
            "type": media_type,
        }

        # Compare against None instead of relying on truthiness: an empty file (b"") is still
        # content that was provided, so its checksum must be recorded rather than silently dropped.
        if file_content is not None:
            self.stac["file:checksum"] = checksum.multihash_as_hex(file_content)
13 changes: 13 additions & 0 deletions scripts/stac/util/media_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from enum import Enum


class StacMediaType(str, Enum):
    """Media types used for STAC files.

    https://github.com/radiantearth/stac-spec/blob/master/commons/links.md#stac-media-types
    """

    # For STAC Catalog and Collection
    JSON = "application/json"
    # For STAC Item
    # https://www.iana.org/assignments/media-types/application/geo+json
    GEOJSON = "application/geo+json"
7 changes: 6 additions & 1 deletion scripts/standardise_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,12 @@ def main() -> None:

# Create STAC and save in target
item = create_item(
file.get_path_standardised(), start_datetime, end_datetime, arguments.collection_id, file.get_gdalinfo()
file.get_path_standardised(),
start_datetime,
end_datetime,
arguments.collection_id,
file.get_gdalinfo(),
file.get_derived_from_paths(),
)
write(stac_item_path, dict_to_json_bytes(item.stac), content_type=ContentType.GEOJSON.value)
get_log().info("stac_saved", path=stac_item_path)
Expand Down
2 changes: 1 addition & 1 deletion scripts/standardising.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def standardising(
footprint_file_name = files.output + SUFFIX_FOOTPRINT
standardized_file_path = os.path.join(target_output, standardized_file_name)
footprint_file_path = os.path.join(target_output, footprint_file_name)
tiff = FileTiff(files.inputs, preset)
tiff = FileTiff(files.inputs, preset, files.includeDerived)
tiff.set_path_standardised(standardized_file_path)

# Already processed, can skip processing
Expand Down
Loading