Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arch finalization proposal #25

Merged
merged 36 commits into from
Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
71ce3e0
adding numpy types to python types conversion for metadata
dchandan Oct 5, 2023
350b4f4
removing collection2enum
dchandan Oct 5, 2023
2a445f0
black
dchandan Oct 5, 2023
2728ce6
extracting pydantic base models to models.py
dchandan Oct 5, 2023
b47d613
removing cmip6 extension code
dchandan Oct 5, 2023
2f5dc39
Breaking CFJsonItem part 1: extracting STAC item creation
dchandan Oct 6, 2023
3f821ce
Breaking CFJsonItem part 2: extracting datacube extension code
dchandan Oct 6, 2023
3c584cc
updating geometry structure
dchandan Oct 12, 2023
b7a7ed9
moving np datatype conversion to a separate function
dchandan Oct 12, 2023
48598ae
modifications to datacube extension helper functions as per Francis's…
dchandan Oct 12, 2023
94eb521
code cleanup
dchandan Oct 12, 2023
a64a226
change how prefix is applied
dchandan Oct 12, 2023
f22c1a2
PR changes
dchandan Oct 13, 2023
efd9230
fixing output media type and roles output for assets
dchandan Oct 17, 2023
3e88591
adding magpie resource link
dchandan Oct 17, 2023
8d66fba
adding collection resource link for Magpie
dchandan Oct 18, 2023
00a968a
posting items fixes
dchandan Oct 19, 2023
2c3b49d
removing function no longer in use
dchandan Oct 19, 2023
6908d55
implemented updating stac collection and items
dchandan Oct 19, 2023
0c959ea
removing need to pass yml file to app on command line
dchandan Oct 19, 2023
73b2773
code cleanup
dchandan Oct 19, 2023
9e919c2
adding __init__ files
dchandan Oct 19, 2023
c62fb80
fix
dchandan Oct 19, 2023
10db128
more fixes
dchandan Oct 19, 2023
25985db
diagnostics
dchandan Oct 23, 2023
6d675bc
removing unused code
dchandan Oct 23, 2023
65bd5bb
refactoring to allow more flexibility
dchandan Oct 23, 2023
f540dbe
fix datacube extension
dchandan Oct 26, 2023
323c945
pr changes
dchandan Oct 27, 2023
0581c61
reverting to old way to read thredds access links
dchandan Oct 27, 2023
37a26e1
adding ability to get single file from THREDDS loader
dchandan Nov 8, 2023
e55591d
making make_cmip6_item_id a staticmethod
dchandan Nov 8, 2023
f1e28db
wrapping call to make STAC item with a try-except block
dchandan Nov 8, 2023
8bb21e1
fixing commit e55591dd0b7f7db6cd4ee7256512d5693d282145
dchandan Nov 8, 2023
3055afc
more fixes to previous commits
dchandan Nov 8, 2023
3f1d284
making tracking_id optional in CMIP6ItemProperties
dchandan Nov 8, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
179 changes: 0 additions & 179 deletions STACpopulator/extensions/cmip6.py

This file was deleted.

20 changes: 16 additions & 4 deletions STACpopulator/input.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from abc import ABC, abstractmethod
from typing import Any, Iterator, MutableMapping, Optional, Tuple

import numpy as np
import requests
import siphon
import xncml
Expand Down Expand Up @@ -37,10 +38,6 @@ def reset(self):
pass






class THREDDSLoader(GenericLoader):
def __init__(self, thredds_catalog_url: str, depth: Optional[int] = None) -> None:
"""Constructor
Expand Down Expand Up @@ -89,6 +86,21 @@ def extract_metadata(self, ds: siphon.catalog.Dataset) -> MutableMapping[str, An
# Convert NcML to CF-compliant dictionary
attrs = xncml.Dataset.from_text(r.content).to_cf_dict()

# Convert numpy scalar types held in the attributes to the equivalent Python built-in types.
# Only existing keys are reassigned, so the dictionary size does not change during iteration.
for key, value in attrs["attributes"].items():
    if isinstance(value, list):
        # Convert element by element; anything that is not a numpy integer/float is kept untouched.
        attrs["attributes"][key] = [
            int(item) if isinstance(item, np.integer) else float(item) if isinstance(item, np.floating) else item
            for item in value
        ]
    elif isinstance(value, np.integer):
        # Test the value itself: `isinstance(type(value), np.integer)` is always False.
        attrs["attributes"][key] = int(value)
    elif isinstance(value, np.floating):
        # Scalar floats get the same treatment as floats found inside lists.
        attrs["attributes"][key] = float(value)

dchandan marked this conversation as resolved.
Show resolved Hide resolved
attrs["access_urls"] = ds.access_urls

return attrs
Expand Down
87 changes: 87 additions & 0 deletions STACpopulator/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import datetime as dt
from typing import Any, Dict, List, Optional, Union

from pydantic import (
AnyHttpUrl,
AnyUrl,
BaseModel,
Field,
SerializeAsAny,
field_validator,
)
from typing_extensions import TypedDict


class Geometry(TypedDict):
    """Typed dictionary describing a geometry as a type name plus nested coordinates."""

    # Name of the geometry type.
    type: str
    # Three levels of nested lists of floats.
    # NOTE(review): this nesting presumably targets a GeoJSON Polygon (rings -> positions -> numbers);
    # other geometry types (e.g. Point) would not fit this annotation — confirm this is intended.
    coordinates: List[List[List[float]]]
dchandan marked this conversation as resolved.
Show resolved Hide resolved


class Asset(BaseModel):
    """Data model for a single asset; only ``href`` is required, every other field defaults to None."""

    # Location of the asset; must be an HTTP(S) URL.
    href: AnyHttpUrl
    # Optional media type of the asset.
    media_type: Optional[str] = None
    # Optional title.
    title: Optional[str] = None
    # Optional free-text description.
    description: Optional[str] = None
    # Optional list of role names associated with the asset.
    roles: Optional[List[str]] = None


class STACItemProperties(BaseModel):
    """Base STAC Item properties data model.

    In concrete implementations, users would want to define a new data model that inherits from this base
    model and extends it with properties tailored to the data they are ingesting.
    """

    # NOTE: start_datetime and end_datetime must stay declared before datetime. Pydantic validates fields in
    # declaration order, and the datetime validator below reads these two already-validated values.
    start_datetime: Optional[dt.datetime] = None
    end_datetime: Optional[dt.datetime] = None
    datetime: Optional[dt.datetime] = None

    @field_validator("datetime", mode="before")
    @classmethod
    def validate_datetime(
        cls, v: Union[dt.datetime, str, None], values: Any
    ) -> Union[dt.datetime, str, None]:
        """Validate the raw ``datetime`` value before pydantic parses it.

        :param v: raw input value for ``datetime``; the string "null" marks an explicitly absent datetime.
        :param values: validation context. Pydantic v2 passes a ``ValidationInfo`` object whose ``data``
            attribute maps the previously validated fields; a plain mapping is accepted as well.
        :return: ``None`` when ``v`` is "null", otherwise ``v`` unchanged so pydantic can parse it.
        :raises ValueError: if ``v`` is "null" and start_datetime or end_datetime is missing.
        """
        if v != "null":
            # The value must be handed back, otherwise the field is silently replaced by None.
            return v

        # ValidationInfo is not subscriptable; the validated fields live in its `data` attribute.
        data = getattr(values, "data", values) or {}
        if not data.get("start_datetime") or not data.get("end_datetime"):
            raise ValueError("start_datetime and end_datetime must be specified when datetime is null")
        return None


# class Link(BaseModel):
# """
# https://github.com/radiantearth/stac-spec/blob/v1.0.0/collection-spec/collection-spec.md#link-object
# """

# href: str = Field(..., alias="href", min_length=1)
# rel: str = Field(..., alias="rel", min_length=1)
# type: Optional[str] = None
# title: Optional[str] = None
# # Label extension
# label: Optional[str] = Field(None, alias="label:assets")
# model_config = ConfigDict(use_enum_values=True)

# def resolve(self, base_url: str) -> None:
# """resolve a link to the given base URL"""
# self.href = urljoin(base_url, self.href)


# class PaginationLink(Link):
# """
# https://github.com/radiantearth/stac-api-spec/blob/master/api-spec.md#paging-extension
# """

# rel: Literal["next", "previous"]
# method: Literal["GET", "POST"]
# body: Optional[Dict[Any, Any]] = None
# merge: bool = False


# Links = RootModel[List[Union[PaginationLink, Link]]]
Comment on lines +64 to +93
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if this gets lost after conversion?

At the very least, the converted STAC Item should contain the rel: source with href pointing at the original NetCDF file and the rel: describes with href pointing at the NCML. In the case of rel: source, it is also very important that title contains the "path" of the file without the THREDDS URL prefix and the "service" path element (dodsC, fileServer, etc.).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I didn't quite understand your point. The access URLs are contained as assets to the item. This is code that David had but it didn't look like it was being used, so I left it there but commented it out.

@huard would you like to pitch in here with a description of what you were doing and if that was complete? Also, how we can include Francis's suggestion?

Copy link
Collaborator

@fmigneault fmigneault Oct 13, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the STAC Item is created (POST'd to the API), some resources in Magpie are generated under the stac service to manage its permissions.
Since that item refers to a NetCDF under THREDDS, a corresponding resource exists under thredds service to manage its permissions as well.
Using bird-house/birdhouse-deploy#386, the intent is that when a permission is updated on one, it will synchronize with the other, such that viewing the NetCDF metadata from either service is allowed/denied in the same way for a given user or group.

The problem in doing the above is that the URLs of these 2 services are not "compatible".
The generated resources would be nested like so (for simplicity, I omit the nested-directory handling below) :

stac
  collections
    <directory-uuid>
      items
        <another-uuid>
thredds
  <directory-name>
    <netcdf-name>

Since one works with UUIDs and the other with path names, there is no direct way to guess the references only from the URLs. Therefore, the STAC hook will use one of the references in links, namely the rel: source one, to obtain the STAC Asset URL of the original THREDDS NetCDF location. To establish the "link" between the stac and thredds Magpie services, it will use the title value from the rel: source link to establish a resource_display_name in Magpie. That parameter will be readable from Cowbird later on to resolve the stac UUID <-> thredds name resource relationship.

For example, the following catalog reference:
https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/testdata/xclim/cmip6/catalog.html?dataset=birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc
Should have (at least) the following link in the STAC Item

{
  "rel": "source",
  "title": "birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc",
  "href": "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/fileServer/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc",
  "type": "application/x-netcdf"
}

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for that explanation @fmigneault. This makes sense. Let me implement this.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay done. If all looks good then I will complete the merge.



class STACItem(BaseModel):
    """STAC Item data model."""

    # Item identifier; required and must be a non-empty string.
    id: str = Field(..., alias="id", min_length=1)
    geometry: Optional[Geometry] = None
    bbox: Optional[List[float]] = None
    # SerializeAsAny makes serialization use the runtime type, so fields added by subclasses of
    # STACItemProperties are included in the output.
    properties: Optional[SerializeAsAny[STACItemProperties]] = None
    # Assets keyed by name. Declared Optional so the annotation agrees with the None default and an
    # explicit `assets=None` validates the same way as omitting the field.
    assets: Optional[Dict[str, Asset]] = None
    stac_extensions: Optional[List[AnyUrl]] = []
    collection: Optional[str] = None
    datetime: Optional[dt.datetime] = None  # Not in the spec, but needed by pystac.Item.
21 changes: 14 additions & 7 deletions STACpopulator/populator_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def __init__(
self._ingest_pipeline = data_loader
self._stac_host = self.validate_host(stac_host)

#self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest()
# self._collection_id = hashlib.md5(self.collection_name.encode("utf-8")).hexdigest()
self._collection_id = self.collection_name
LOGGER.info("Initialization complete")
LOGGER.info(f"Collection {self.collection_name} is assigned id {self._collection_id}")
Expand All @@ -76,6 +76,13 @@ def stac_host(self) -> str:
def collection_id(self) -> str:
return self._collection_id

@property
@abstractmethod
def item_properties_model(self):
    """Pydantic data model used for the item properties.

    Derived classes should define this property as a pydantic data model that derives from
    models.STACItemProperties.
    """
dchandan marked this conversation as resolved.
Show resolved Hide resolved

def validate_host(self, stac_host: str) -> str:
if not url_validate(stac_host):
raise ValueError("stac_host URL is not appropriately formatted")
Expand Down Expand Up @@ -115,12 +122,12 @@ def ingest(self) -> None:
for item_name, item_data in self._ingest_pipeline:
LOGGER.info(f"Creating STAC representation for {item_name}")
stac_item = self.create_stac_item(item_name, item_data)
post_stac_item(self.stac_host, self.collection_id, item_name, stac_item)
try:
pass
except Exception:
LOGGER.error(f"Failed adding STAC item {item_name}")
self.handle_ingestion_error("Posting Error", item_name, item_data)
# post_stac_item(self.stac_host, self.collection_id, item_name, stac_item)
# try:
# pass
# except Exception:
# LOGGER.error(f"Failed adding STAC item {item_name}")
# self.handle_ingestion_error("Posting Error", item_name, item_data)
dchandan marked this conversation as resolved.
Show resolved Hide resolved

@abstractmethod
def handle_ingestion_error(self, error: str, item_name: str, item_data: MutableMapping[str, Any]):
Expand Down
Loading
Loading