Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ncattrs function to get nc attributes without the need for siphon. #35

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

<!-- insert list items of new changes here -->

* New function `ncattrs` to get attributes from netCDF files hosted on a THREDDS server.

## [0.2.0](https://github.com/crim-ca/stac-populator/tree/0.2.0) (2023-11-10)


Expand Down
50 changes: 50 additions & 0 deletions STACpopulator/stac_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,56 @@ def collection2literal(collection, property="label"):
return Literal[terms]


def thredds_catalog_attrs(url: str) -> dict:
    """Return attributes from the catalog.xml THREDDS server response.

    Parameters
    ----------
    url : str
        URL of a THREDDS `catalog.xml` document.

    Returns
    -------
    dict
        The XML document converted to nested dictionaries by `xmltodict`.
        The THREDDS InvCatalog v1.0 namespace (in both its `http` and `https`
        spellings) is mapped to None so element names carry no namespace prefix.

    Raises
    ------
    requests.HTTPError
        If the server replies with a 4xx or 5xx status code.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    import requests
    import xmltodict

    # A timeout prevents an unresponsive server from blocking the caller forever.
    response = requests.get(url, timeout=30)
    # Fail early on HTTP errors rather than handing an error page to the XML parser.
    response.raise_for_status()

    return xmltodict.parse(
        response.text,
        process_namespaces=True,
        namespaces={
            "http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0": None,
            "https://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0": None,
        },
    )


def ncattrs(url: str) -> dict:
    """Return attributes from a THREDDS netCDF dataset.

    The catalog.xml located in the same URL "directory" as the file is queried
    to discover the services offered for the dataset. The NcML service is then
    used to fetch the dataset attributes.

    Parameters
    ----------
    url : str
        URL of the netCDF file on the THREDDS server. The last path segment is
        taken as the file name, everything before it as the catalog location.

    Returns
    -------
    dict
        The CF dictionary produced by `xncml` from the NcML response, where
        the "attributes" entry has been passed through
        `numpy_to_python_datatypes`, plus an "access_urls" entry mapping each
        service type found in the catalog to the dataset URL for that service.

    Raises
    ------
    requests.HTTPError
        If the catalog or the NcML request returns a 4xx or 5xx status code.
    KeyError
        If the catalog lacks the expected entries or advertises no NCML service.
    """
    # `import urllib` alone does not guarantee that the `parse` submodule is loaded.
    import urllib.parse

    import requests
    import xncml

    pr = urllib.parse.urlparse(url)

    # Split the URL into its parent location and the netCDF file name.
    parent, _, nc = url.rpartition("/")

    # Get catalog information about available services
    catalog = f"{parent}/catalog.xml"
    cattrs = thredds_catalog_attrs(catalog)["catalog"]

    cid = cattrs["dataset"]["@ID"]

    # xmltodict yields a dict rather than a list when the compound service
    # has a single child; normalise so the loop below always sees a list.
    services = cattrs["service"]["service"]
    if isinstance(services, dict):
        services = [services]

    # Get service URLs for the dataset
    access_urls = {
        service["@serviceType"]: f'{pr.scheme}://{pr.netloc}{service["@base"]}{cid}/{nc}'
        for service in services
    }

    if "NCML" not in access_urls:
        raise KeyError(
            f"No NCML service advertised in {catalog}; available services: {sorted(access_urls)}"
        )

    # Get dataset attributes
    r = requests.get(access_urls["NCML"], timeout=30)
    # Fail early on HTTP errors rather than parsing an error page as NcML.
    r.raise_for_status()
    attrs = xncml.Dataset.from_text(r.text).to_cf_dict()
    attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"])

    # Include service attributes
    attrs["access_urls"] = access_urls
    return attrs


def ncattrs_to_geometry(attrs: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
"""Create Polygon geometry from CFMetadata."""
attrs = attrs["groups"]["CFMetadata"]["attributes"]
Expand Down
37 changes: 6 additions & 31 deletions tests/test_standalone_stac_item.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import json
import pytest
import requests
import os
import tempfile
from urllib.parse import quote

import xncml

from STACpopulator.implementations.CMIP6_UofT.add_CMIP6 import CMIP6ItemProperties, CMIP6populator
from STACpopulator.input import THREDDSLoader
from STACpopulator.models import GeoJSONPolygon
from STACpopulator.stac_utils import STAC_item_from_metadata
from STACpopulator.stac_utils import STAC_item_from_metadata, ncattrs
from pystac.validation import JsonSchemaSTACValidator
from pystac import STACObjectType

CUR_DIR = os.path.dirname(__file__)

Expand All @@ -21,35 +20,11 @@ def quote_none_safe(url):

@pytest.mark.online
def test_standalone_stac_item_thredds_ncml():
thredds_url = "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds"
thredds_path = "birdhouse/testdata/xclim/cmip6"
thredds_nc = "sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc"
thredds_catalog = f"{thredds_url}/catalog/{thredds_path}/catalog.html"
thredds_ds = f"{thredds_path}/{thredds_nc}"
thredds_ncml_url = (
f"{thredds_url}/ncml/{thredds_path}/{thredds_nc}"
f"?catalog={quote_none_safe(thredds_catalog)}&dataset={quote_none_safe(thredds_ds)}"
)

# FIXME: avoid hackish workarounds
data = requests.get(thredds_ncml_url).text
attrs = xncml.Dataset.from_text(data).to_cf_dict()
attrs["access_urls"] = { # FIXME: all following should be automatically added, but they are not!
"HTTPServer": f"{thredds_url}/fileServer/{thredds_path}/{thredds_nc}",
"OPENDAP": f"{thredds_url}/dodsC/{thredds_path}/{thredds_nc}",
"WCS": f"{thredds_url}/wcs/{thredds_path}/{thredds_nc}?service=WCS&version=1.0.0&request=GetCapabilities",
"WMS": f"{thredds_url}/wms/{thredds_path}/{thredds_nc}?service=WMS&version=1.3.0&request=GetCapabilities",
"NetcdfSubset": f"{thredds_url}/ncss/{thredds_path}/{thredds_nc}/dataset.html",
}

url = "https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/birdhouse/testdata/xclim/cmip6/sic_SImon_CCCma-CanESM5_ssp245_r13i1p2f1_2020.nc"
huard marked this conversation as resolved.
Show resolved Hide resolved
attrs = ncattrs(url)
stac_item_id = CMIP6populator.make_cmip6_item_id(attrs["attributes"])
stac_item = STAC_item_from_metadata(stac_item_id, attrs, CMIP6ItemProperties, GeoJSONPolygon)

ref_file = os.path.join(CUR_DIR, "data/stac_item_testdata_xclim_cmip6_ncml.json")
with open(ref_file, mode="r", encoding="utf-8") as ff:
reference = json.load(ff)

assert stac_item.to_dict() == reference
huard marked this conversation as resolved.
Show resolved Hide resolved
assert stac_item.validate()
huard marked this conversation as resolved.
Show resolved Hide resolved


class MockedNoSTACUpload(CMIP6populator):
Expand Down
Loading