Skip to content

Commit

Permalink
Merge pull request #380 from gaiaresources/BDRSPS-1144-resusable-sites
Browse files Browse the repository at this point in the history
BDRSPS-1144 Re-usable Sites schema and validation changes
  • Loading branch information
chungvl authored Jan 13, 2025
2 parents 8e8089b + 3232e0a commit 107a014
Show file tree
Hide file tree
Showing 37 changed files with 1,530 additions and 240 deletions.
1 change: 1 addition & 0 deletions abis_mapping/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Exports sub-packages interface."""

# Local
from . import identifier
from . import metadata
from . import schema
from . import spatial
Expand Down
68 changes: 68 additions & 0 deletions abis_mapping/models/identifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""Provides models related to "identifiers" in the template data."""

# Standard library
import dataclasses

# Third-party
import frictionless

# Typing
from typing import Self


@dataclasses.dataclass(eq=True, frozen=True, kw_only=True)
class SiteIdentifier:
    """Value object describing how a template row identifies a Site.

    A row can identify a Site in one of two ways:

    * the existingBDRSiteIRI field on its own, or
    * the siteID and siteIDSource fields together.

    Instances are frozen and compare by field value, so two identifiers
    built from different rows are equal when they carry the same values.
    """

    site_id: str | None
    site_id_source: str | None
    existing_bdr_site_iri: str | None

    @classmethod
    def from_row(cls, row: frictionless.Row) -> Self | None:
        """Build the SiteIdentifier for the site-identifying fields of a row.

        Args:
            row: The row of template data to read the fields from.

        Returns:
            A SiteIdentifier holding only existingBDRSiteIRI when that field
            has a value; otherwise one holding siteID and siteIDSource when
            both have a value; otherwise None.
        """
        # existingBDRSiteIRI is the higher "source of truth": when it is present
        # it is the only thing stored, so two rows with the same IRI compare
        # equal even when their siteID / siteIDSource values differ.
        iri: str | None = row["existingBDRSiteIRI"]
        if iri:
            return cls(
                site_id=None,
                site_id_source=None,
                existing_bdr_site_iri=iri,
            )

        # Fall back to the siteID + siteIDSource pair; both halves are needed.
        local_id: str | None = row["siteID"]
        local_id_source: str | None = row["siteIDSource"]
        if not (local_id and local_id_source):
            # Nothing usable identifies a Site in this row.
            return None

        return cls(
            site_id=local_id,
            site_id_source=local_id_source,
            existing_bdr_site_iri=None,
        )

    def __format__(self, format_spec: str) -> str:
        """Render the identifier the way it should appear in error messages.

        Args:
            format_spec: Ignored; the rendering is the same for every spec.

        Returns:
            A phrase naming the field(s) in use together with their values.
        """
        if not self.existing_bdr_site_iri:
            return f'siteID "{self.site_id}" and siteIDSource "{self.site_id_source}"'
        return f'existingBDRSiteIRI "{self.existing_bdr_site_iri}"'
3 changes: 3 additions & 0 deletions abis_mapping/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@
from . import mutual_inclusion
from . import related_site_id_part_of_lookup
from . import required
from . import site_id_or_iri_validation
from . import site_identifier_match
from . import sites_geometry
from . import string_customized
from . import survey_id_validation
from . import tabular
from . import timestamp
from . import unique_together
from . import wkt
44 changes: 37 additions & 7 deletions abis_mapping/plugins/default_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,13 @@
import frictionless.errors

# Typing
from typing import Iterator
from typing import Callable, Iterator


# TODO remove once SSD v2 removed.
_default_error_template = (
"'{key_field}': '{key_value}' has no default value for field '{value_field}' and no other value provided."
)


@attrs.define(kw_only=True, repr=False)
Expand All @@ -21,12 +27,17 @@ class DefaultLookup(frictionless.Check):
Errors = [frictionless.errors.RowConstraintError]

# Attributes specific to this check
# Name of field used for default map lookup
key_field: str
# Name of field used for lookup value, or a callable to get the lookup value.
key_field: str | Callable[[frictionless.Row], object]
# Name of field which default map value corresponds
value_field: str
# Default map consisting of keys from key_field and values for value_field
default_map: Mapping[object, object]
# error message templates,
# used when key_field doesn't get a value from the row.
no_key_error_template: str = _default_error_template
# used when the default_map doesn't provide a value.
no_default_error_template: str = _default_error_template

def validate_row(self, row: frictionless.Row) -> Iterator[frictionless.Error]:
"""Called to validate given row (on every row)
Expand All @@ -41,15 +52,34 @@ def validate_row(self, row: frictionless.Row) -> Iterator[frictionless.Error]:
if row[self.value_field] is not None:
return

# Get value to lookup default map with
if isinstance(self.key_field, str):
lookup_value = row[self.key_field]
else:
lookup_value = self.key_field(row)

# No lookup value is an error
if lookup_value is None:
yield frictionless.errors.RowConstraintError.from_row(
row=row,
note=self.no_key_error_template.format(
key_value=lookup_value,
key_field=self.key_field,
value_field=self.value_field,
),
)
return

# Determine if default value entry exists
if row[self.key_field] in self.default_map:
if lookup_value in self.default_map:
return

# Yield Error
yield frictionless.errors.RowConstraintError.from_row(
row=row,
note=(
f"'{self.key_field}': '{row[self.key_field]}' has no default value "
f"for field '{self.value_field}' and no other value provided."
note=self.no_default_error_template.format(
key_value=lookup_value,
key_field=self.key_field,
value_field=self.value_field,
),
)
48 changes: 48 additions & 0 deletions abis_mapping/plugins/site_id_or_iri_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Provides extra frictionless check"""

# Third-Party
import attrs
import frictionless
import frictionless.errors

# Typing
from collections.abc import Iterator


@attrs.define(kw_only=True, repr=False)
class SiteIdentifierCheck(frictionless.Check):
    """Row check requiring either (siteID + siteIDSource) or existingBDRSiteIRI."""

    # Check Attributes
    type = "site-identifier"
    Errors = [frictionless.errors.RowConstraintError]

    # Optional name of a gating field: when set, rows where that field is None
    # are not checked at all.
    skip_when_missing: str | None = None

    def validate_row(self, row: frictionless.Row) -> Iterator[frictionless.Error]:
        """Validate a single row (called for every row).

        Args:
            row: The row to check.

        Yields:
            A RowConstraintError when the row has neither existingBDRSiteIRI
            nor both siteID and siteIDSource.
        """
        gate_field = self.skip_when_missing
        if gate_field is not None and row[gate_field] is None:
            # The gating field is empty, so the check does not apply to this row.
            return

        # Read all three identifying fields.
        site_id: str | None = row["siteID"]
        site_id_source: str | None = row["siteIDSource"]
        existing_bdr_site_iri: str | None = row["existingBDRSiteIRI"]

        # Either way of identifying the Site satisfies the check.
        if (site_id and site_id_source) or existing_bdr_site_iri:
            return

        # Assemble the message, mentioning the gating field when there is one.
        note_parts = ["Either siteID and siteIDSource, or existingBDRSiteIRI must be provided"]
        if gate_field is not None:
            note_parts.append(f", when {gate_field} is provided")
        note_parts.append(".")

        yield frictionless.errors.RowConstraintError.from_row(
            row=row,
            note="".join(note_parts),
        )
92 changes: 92 additions & 0 deletions abis_mapping/plugins/site_identifier_match.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""Provides extra frictionless check"""

# Third-Party
import attrs
import frictionless
import frictionless.errors

# Local
from abis_mapping import models

# Typing
from collections.abc import Iterator, Mapping


@attrs.define(kw_only=True, repr=False)
class SiteIdentifierMatches(frictionless.Check):
    """Checks that a row's siteVisitID and SiteIdentifier agree with another template.

    The check is given a map from siteVisitID to SiteIdentifier taken from the
    other template (typically survey_site_visit_data). For each row that has both
    a siteVisitID and a SiteIdentifier, the row's SiteIdentifier must equal the
    one recorded against that siteVisitID in the map.

    In foreign-key terms, the 'source of truth' links from an Occurrence to a
    Site (when there is a Visit) are:
        occurrence.siteVisitID --> site_visit.siteVisitID
        site_visit.SiteIdentifier --> site.SiteIdentifier
    and there is also a 'short-cut' link straight from Occurrence to Site:
        occurrence.SiteIdentifier --> site.SiteIdentifier
    This check makes sure the short-cut agrees with the source of truth.
    """

    # Check Attributes
    type = "site-identifier-matches"
    Errors = [frictionless.errors.RowConstraintError, frictionless.errors.ConstraintError]

    # Map from siteVisitID to SiteIdentifier, from the other template (typically survey_site_visit_data).
    # A value of None means that template had no SiteIdentifier for the visit.
    site_visit_id_site_id_map: Mapping[str, models.identifier.SiteIdentifier | None]

    def validate_row(self, row: frictionless.Row) -> Iterator[frictionless.Error]:
        """Validate a single row (called for every row).

        Args:
            row: The row to check.

        Yields:
            A ConstraintError on siteVisitID when it is not a key of the map, or
            a RowConstraintError when the row's SiteIdentifier differs from the
            one mapped to its siteVisitID.
        """
        # Rows without a siteVisitID are out of scope. If siteVisitID should be
        # compulsory, enforce that with a required constraint or similar.
        visit_id: str | None = row["siteVisitID"]
        if not visit_id:
            return

        # Rows without a SiteIdentifier are out of scope too. If the identifier
        # must be provided, enforce that with the SiteIdentifierCheck plugin.
        row_identifier = models.identifier.SiteIdentifier.from_row(row)
        if not row_identifier:
            return

        visit_map = self.site_visit_id_site_id_map

        # A siteVisitID absent from the map was not in the site visit data
        # template, which is an error in this template.
        if visit_id not in visit_map:
            yield frictionless.errors.ConstraintError.from_row(
                row=row,
                note="siteVisitID must match a siteVisitID in the survey_site_visit_data template",
                field_name="siteVisitID",
            )
            return

        expected_identifier = visit_map[visit_id]
        if not expected_identifier:
            # The site visit data template is missing the site identifier; that is
            # reported against that template, so nothing is raised here.
            return

        # Both templates have a SiteIdentifier: report when they differ.
        if expected_identifier != row_identifier:
            # Name the field(s) the other template used to identify the Site.
            fields = (
                "existingBDRSiteIRI"
                if expected_identifier.existing_bdr_site_iri
                else "siteID and siteIDSource"
            )
            yield frictionless.errors.RowConstraintError.from_row(
                row=row,
                note=(
                    f"{fields} must match their values in the survey_site_visit_data template "
                    f'at the row with siteVisitID "{visit_id}".'
                ),
            )
            return

        # Otherwise the identifiers match and there is no error to raise.
28 changes: 25 additions & 3 deletions abis_mapping/plugins/sites_geometry.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
import frictionless.errors
import attrs

# Local
from abis_mapping import models

# Typing
from collections.abc import Collection
from typing import Iterator


Expand All @@ -18,7 +22,8 @@ class SitesGeometry(frictionless.Check):
Errors = [frictionless.errors.RowConstraintError]

# Occurrences site ids to be passed in from occurrence template.
occurrence_site_ids: set[str] = set()
occurrence_site_ids: Collection[str] | None = None
occurrence_site_identifiers: Collection[models.identifier.SiteIdentifier] | None = None

def validate_row(self, row: frictionless.Row) -> Iterator[frictionless.Error]:
"""Called to validate the given row (on every row).
Expand All @@ -34,10 +39,27 @@ def validate_row(self, row: frictionless.Row) -> Iterator[frictionless.Error]:
long = row["decimalLongitude"] is not None
datum = row["geodeticDatum"] is not None
wkt = row["footprintWKT"] is not None
site_id = row["siteID"] in self.occurrence_site_ids

# Perform check
if (lat and long and datum) or (wkt and datum) or site_id:
if (lat and long and datum) or (wkt and datum):
return

# See if site was used by the occurrence template
if self.occurrence_site_ids is not None:
site_id = row["siteID"]
site_used_by_occurrences = site_id and site_id in self.occurrence_site_ids
elif self.occurrence_site_identifiers is not None:
site_identifier = models.identifier.SiteIdentifier.from_row(row)
site_used_by_occurrences = site_identifier and site_identifier in self.occurrence_site_identifiers
else:
site_used_by_occurrences = False

# If geometry fields are invalid, but the Site is used by Occurrence(s), don't error.
# This is because if all the Occurrences using the Site, have their own valid location,
# it doesn't matter the location here is missing.
# On the other hand, if any of the Occurrences don't have their own valid location,
# An error will be raised on them when they fail to fall back to this Site's location.
if site_used_by_occurrences:
return

# Create error note
Expand Down
Loading

0 comments on commit 107a014

Please sign in to comment.