diff --git a/abis_mapping/models/__init__.py b/abis_mapping/models/__init__.py index 46898761..025c470e 100644 --- a/abis_mapping/models/__init__.py +++ b/abis_mapping/models/__init__.py @@ -1,6 +1,7 @@ """Exports sub-packages interface.""" # Local +from . import identifier from . import metadata from . import schema from . import spatial diff --git a/abis_mapping/models/identifier.py b/abis_mapping/models/identifier.py new file mode 100644 index 00000000..abfe8f1f --- /dev/null +++ b/abis_mapping/models/identifier.py @@ -0,0 +1,68 @@ +"""Provides models related to "identifiers" in the template data.""" + +# Standard library +import dataclasses + +# Third-party +import frictionless + +# Typing +from typing import Self + + +@dataclasses.dataclass(eq=True, frozen=True, kw_only=True) +class SiteIdentifier: + """A class to represent how a Site is identified in a row from a template. + + This is effectively either the existingBDRSiteIRI field, + or a combination of the siteID and siteIDSource fields. + These are the two ways Sites can be identified in a template. + """ + + site_id: str | None + site_id_source: str | None + existing_bdr_site_iri: str | None + + @classmethod + def from_row(cls, row: frictionless.Row) -> Self | None: + """Given a row in a template, return a SiteIdentifier for the site-id related fields. + + Args: + row: The row of data. + + Returns: + The SiteIdentifier for the siteID-related fields. + None when the siteID-related fields are not in the row. + + """ + # "existingBDRSiteIRI" is considered a higher "source of truth", + # if a row has that, only use that as the identifier. + # This means that two sets of identifier fields will compare equal if their + # existingBDRSiteIRI matches, even if the other fields do not match.
+ existing_bdr_site_iri: str | None = row["existingBDRSiteIRI"] + if existing_bdr_site_iri: + return cls( + site_id=None, + site_id_source=None, + existing_bdr_site_iri=existing_bdr_site_iri, + ) + + # Otherwise try to use siteID and siteIDSource. + site_id: str | None = row["siteID"] + site_id_source: str | None = row["siteIDSource"] + if site_id and site_id_source: + return cls( + site_id=site_id, + site_id_source=site_id_source, + existing_bdr_site_iri=None, + ) + + # Otherwise return None. + return None + + def __format__(self, format_spec: str) -> str: + """Format the SiteIdentifier how it should be represented in error messages.""" + if self.existing_bdr_site_iri: + return f'existingBDRSiteIRI "{self.existing_bdr_site_iri}"' + else: + return f'siteID "{self.site_id}" and siteIDSource "{self.site_id_source}"' diff --git a/abis_mapping/plugins/__init__.py b/abis_mapping/plugins/__init__.py index e218616d..7301880e 100644 --- a/abis_mapping/plugins/__init__.py +++ b/abis_mapping/plugins/__init__.py @@ -13,9 +13,12 @@ from . import mutual_inclusion from . import related_site_id_part_of_lookup from . import required +from . import site_id_or_iri_validation +from . import site_identifier_match from . import sites_geometry from . import string_customized from . import survey_id_validation from . import tabular from . import timestamp +from . import unique_together from . import wkt diff --git a/abis_mapping/plugins/default_lookup.py b/abis_mapping/plugins/default_lookup.py index daa62d89..14549c66 100644 --- a/abis_mapping/plugins/default_lookup.py +++ b/abis_mapping/plugins/default_lookup.py @@ -9,7 +9,13 @@ import frictionless.errors # Typing -from typing import Iterator +from typing import Callable, Iterator + + +# TODO remove once SSD v2 removed. +_default_error_template = ( + "'{key_field}': '{key_value}' has no default value for field '{value_field}' and no other value provided." 
+) @attrs.define(kw_only=True, repr=False) @@ -21,12 +27,17 @@ class DefaultLookup(frictionless.Check): Errors = [frictionless.errors.RowConstraintError] # Attributes specific to this check - # Name of field used for default map lookup - key_field: str + # Name of field used for lookup value, or a callable to get the lookup value. + key_field: str | Callable[[frictionless.Row], object] # Name of field which default map value corresponds value_field: str # Default map consisting of keys from key_field and values for value_field default_map: Mapping[object, object] + # error message templates, + # used when key_field doesn't get a value from the row. + no_key_error_template: str = _default_error_template + # used when the default_map doesn't provide a value. + no_default_error_template: str = _default_error_template def validate_row(self, row: frictionless.Row) -> Iterator[frictionless.Error]: """Called to validate given row (on every row) @@ -41,15 +52,34 @@ def validate_row(self, row: frictionless.Row) -> Iterator[frictionless.Error]: if row[self.value_field] is not None: return + # Get value to lookup default map with + if isinstance(self.key_field, str): + lookup_value = row[self.key_field] + else: + lookup_value = self.key_field(row) + + # No lookup value is an error + if lookup_value is None: + yield frictionless.errors.RowConstraintError.from_row( + row=row, + note=self.no_key_error_template.format( + key_value=lookup_value, + key_field=self.key_field, + value_field=self.value_field, + ), + ) + return + # Determine if default value entry exists - if row[self.key_field] in self.default_map: + if lookup_value in self.default_map: return # Yield Error yield frictionless.errors.RowConstraintError.from_row( row=row, - note=( - f"'{self.key_field}': '{row[self.key_field]}' has no default value " - f"for field '{self.value_field}' and no other value provided." 
+ note=self.no_default_error_template.format( + key_value=lookup_value, + key_field=self.key_field, + value_field=self.value_field, ), ) diff --git a/abis_mapping/plugins/site_id_or_iri_validation.py b/abis_mapping/plugins/site_id_or_iri_validation.py new file mode 100644 index 00000000..be224dfa --- /dev/null +++ b/abis_mapping/plugins/site_id_or_iri_validation.py @@ -0,0 +1,48 @@ +"""Provides extra frictionless check""" + +# Third-Party +import attrs +import frictionless +import frictionless.errors + +# Typing +from collections.abc import Iterator + + +@attrs.define(kw_only=True, repr=False) +class SiteIdentifierCheck(frictionless.Check): + """Checks if the row has either (siteID + siteIDSource) or existingBDRSiteIRI""" + + # Check Attributes + type = "site-identifier" + Errors = [frictionless.errors.RowConstraintError] + + # optionally only apply this check when this field has a value + skip_when_missing: str | None = None + + def validate_row(self, row: frictionless.Row) -> Iterator[frictionless.Error]: + """Called to validate the given row (on every row). + + Args: + row: The row to check. + + Yields: + Any errors found in the row. + """ + if self.skip_when_missing is not None and row[self.skip_when_missing] is None: + return + + # Get values + site_id: str | None = row["siteID"] + site_id_source: str | None = row["siteIDSource"] + existing_bdr_site_iri: str | None = row["existingBDRSiteIRI"] + + if not ((site_id and site_id_source) or existing_bdr_site_iri): + note = "Either siteID and siteIDSource, or existingBDRSiteIRI must be provided" + if self.skip_when_missing is not None: + note += f", when {self.skip_when_missing} is provided" + note += "." 
+ yield frictionless.errors.RowConstraintError.from_row( + row=row, + note=note, + ) diff --git a/abis_mapping/plugins/site_identifier_match.py b/abis_mapping/plugins/site_identifier_match.py new file mode 100644 index 00000000..616f1d06 --- /dev/null +++ b/abis_mapping/plugins/site_identifier_match.py @@ -0,0 +1,92 @@ +"""Provides extra frictionless check""" + +# Third-Party +import attrs +import frictionless +import frictionless.errors + +# Local +from abis_mapping import models + +# Typing +from collections.abc import Iterator, Mapping + + +@attrs.define(kw_only=True, repr=False) +class SiteIdentifierMatches(frictionless.Check): + """Checks if the row's siteVisitID+SiteIdentifier matches another template. + + This is used by the survey_occurrence_data template to check that each occurrence + with a siteVisitID, has a SiteIdentifier that matches the SiteIdentifier for that + siteVisitID in the survey_site_visit_data template. + + i.e. The 'source of truth' FKs linking an Occurrence to a Site (when there is a Visit) are; + + occurrence.siteVisitID --> site_visit.siteVisitID && site_visit.SiteIdentifier --> site.SiteIdentifier + + There is also a 'short-cut' FK directly from Occurrence to Site; + + occurrence.SiteIdentifier --> site.SiteIdentifier + + This Check ensures the 'short-cut' FK agrees with the 'source of truth' ones. + """ + + # Check Attributes + type = "site-identifier-matches" + Errors = [frictionless.errors.RowConstraintError, frictionless.errors.ConstraintError] + + # Map from siteVisitID to SiteIdentifier, from the other template (typically survey_site_visit_data). + site_visit_id_site_id_map: Mapping[str, models.identifier.SiteIdentifier | None] + + def validate_row(self, row: frictionless.Row) -> Iterator[frictionless.Error]: + """Called to validate the given row (on every row). + + Args: + row: The row to check. + + Yields: + Any errors found in the row. + """ + # If this template has no siteVisitID, skip the check.
+ site_visit_id: str | None = row["siteVisitID"] + if not site_visit_id: + return + # If siteVisitID should be compulsory, enforce that with a required constraint or similar. + + # If this template has no identifier, skip the check + identifier = models.identifier.SiteIdentifier.from_row(row) + if not identifier: + return + # If the identifier must be provided, enforce that with the SiteIdentifierCheck plugin. + + # if siteVisitID not in the map, means it wasn't in the site visit data template, + # that's an error in this template. + if site_visit_id not in self.site_visit_id_site_id_map: + yield frictionless.errors.ConstraintError.from_row( + row=row, + note="siteVisitID must match a siteVisitID in the survey_site_visit_data template", + field_name="siteVisitID", + ) + return + + expected_site_identifier = self.site_visit_id_site_id_map[site_visit_id] + if not expected_site_identifier: + # The site_visit_data template is missing the site identifier, + # that will be an error in that template, no need to raise an error here. + return + + # both templates have SiteIdentifiers, check if they don't match. + if expected_site_identifier != identifier: + if expected_site_identifier.existing_bdr_site_iri: + fields = "existingBDRSiteIRI" + else: + fields = "siteID and siteIDSource" + yield frictionless.errors.RowConstraintError.from_row( + row=row, + note=( + f'{fields} must match their values in the survey_site_visit_data template at the row with siteVisitID "{site_visit_id}".' + ), + ) + return + + # Otherwise identifiers match, no error to raise. 
diff --git a/abis_mapping/plugins/sites_geometry.py b/abis_mapping/plugins/sites_geometry.py index d5233374..09a1a404 100644 --- a/abis_mapping/plugins/sites_geometry.py +++ b/abis_mapping/plugins/sites_geometry.py @@ -5,7 +5,11 @@ import frictionless.errors import attrs +# Local +from abis_mapping import models + # Typing +from collections.abc import Collection from typing import Iterator @@ -18,7 +22,8 @@ class SitesGeometry(frictionless.Check): Errors = [frictionless.errors.RowConstraintError] # Occurrences site ids to be passed in from occurrence template. - occurrence_site_ids: set[str] = set() + occurrence_site_ids: Collection[str] | None = None + occurrence_site_identifiers: Collection[models.identifier.SiteIdentifier] | None = None def validate_row(self, row: frictionless.Row) -> Iterator[frictionless.Error]: """Called to validate the given row (on every row). @@ -34,10 +39,27 @@ def validate_row(self, row: frictionless.Row) -> Iterator[frictionless.Error]: long = row["decimalLongitude"] is not None datum = row["geodeticDatum"] is not None wkt = row["footprintWKT"] is not None - site_id = row["siteID"] in self.occurrence_site_ids # Perform check - if (lat and long and datum) or (wkt and datum) or site_id: + if (lat and long and datum) or (wkt and datum): + return + + # See if site was used by the occurrence template + if self.occurrence_site_ids is not None: + site_id = row["siteID"] + site_used_by_occurrences = site_id and site_id in self.occurrence_site_ids + elif self.occurrence_site_identifiers is not None: + site_identifier = models.identifier.SiteIdentifier.from_row(row) + site_used_by_occurrences = site_identifier and site_identifier in self.occurrence_site_identifiers + else: + site_used_by_occurrences = False + + # If geometry fields are invalid, but the Site is used by Occurrence(s), don't error. + # This is because if all the Occurrences using the Site have their own valid location, + # it doesn't matter that the location here is missing.
+ # On the other hand, if any of the Occurrences don't have their own valid location, + # an error will be raised on them when they fail to fall back to this Site's location. + if site_used_by_occurrences: return # Create error note diff --git a/abis_mapping/plugins/unique_together.py b/abis_mapping/plugins/unique_together.py new file mode 100644 index 00000000..76b1f3bd --- /dev/null +++ b/abis_mapping/plugins/unique_together.py @@ -0,0 +1,72 @@ +"""Provides extra frictionless unique together checks for the package""" + +# Third-Party +import attrs +import frictionless +import frictionless.errors + +# Typing +from collections.abc import Iterator, Sequence +from typing import Literal + + +class UniqueTogetherError(frictionless.errors.RowError): + type = "unique-together" + title = "Unique Together Error" + description = "Each row must have a unique combination of values in the unique together fields." + template = 'Row at position "{rowNumber}" violates the unique together constraint: {note}' + + +@attrs.define(kw_only=True, repr=False) +class UniqueTogether(frictionless.Check): + """Checks whether 2 or more columns are unique together within the dataset.""" + + # Check Attributes + type = "unique-together" + Errors = [UniqueTogetherError] + + # Attributes to customize this check + fields: Sequence[str] + null_handling: Literal[ + "skip", # Skip any row where any of the fields is None + # This is like a regular multi-column unique constraint in SQL. + "include", # Include rows with None in the check, treating None as equal to itself. + # This is like a multi-column unique constraint in SQL with the NULLS NOT DISTINCT option.
+ ] + error_message_template: str = ( + "The unique together fields [{fields}] contain the values [{values}] " + 'that have already been used in the row at position "{first_seen_row_number}"' + ) + + # Private attribute to track the values seen so far, and at which row number + _seen_values: dict[tuple[object, ...], int] = attrs.field(factory=dict, init=False) + + def validate_row(self, row: frictionless.Row) -> Iterator[frictionless.Error]: + """Called to validate the given row (on every row). + + Args: + row: The row to check. + + Yields: + Any errors found in the row. + """ + # Get tuple of values for the fields + values: tuple[object, ...] = tuple(row[key] for key in self.fields) + + # Check if the row should be skipped + if None in values and self.null_handling == "skip": + return + + if (first_seen_row_number := self._seen_values.get(values)) is not None: + # If values already seen, return an error + yield UniqueTogetherError.from_row( + row=row, + note=self.error_message_template.format( + fields=", ".join(self.fields), + values=", ".join(map(str, values)), + first_seen_row_number=first_seen_row_number, + ), + ) + else: + # otherwise add them to the seen values to check following rows. 
+ self._seen_values[values] = row.row_number diff --git a/abis_mapping/templates/survey_occurrence_data_v3/examples/margaret_river_flora/margaret_river_flora.csv b/abis_mapping/templates/survey_occurrence_data_v3/examples/margaret_river_flora/margaret_river_flora.csv index 49ddf57d..92d7c5c3 100644 --- a/abis_mapping/templates/survey_occurrence_data_v3/examples/margaret_river_flora/margaret_river_flora.csv +++ b/abis_mapping/templates/survey_occurrence_data_v3/examples/margaret_river_flora/margaret_river_flora.csv @@ -1,17 +1,17 @@ -providerRecordID,providerRecordIDSource,locality,decimalLatitude,decimalLongitude,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations,eventDateStart,eventDateEnd,samplingProtocol,basisOfRecord,recordedBy,recordNumber,occurrenceStatus,habitat,establishmentMeans,organismRemarks,individualCount,organismQuantity,organismQuantityType,lifeStage,sex,reproductiveCondition,ownerRecordID,ownerRecordIDSource,collectionCode,catalogNumber,catalogNumberSource,otherCatalogNumbers,otherCatalogNumbersSource,preparations,preparedDate,associatedSequences,sequencingMethod,verbatimIdentification,dateIdentified,identifiedBy,identificationMethod,scientificName,identificationQualifier,identificationRemarks,acceptedNameUsage,kingdom,taxonRank,threatStatus,conservationAuthority,threatStatusCheckProtocol,threatStatusDateDetermined,threatStatusDeterminedBy,sensitivityCategory,sensitivityAuthority,surveyID,siteID,siteVisitID -1,WAM,Cowaramup Bay Road,-33.8,115.21,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Calothamnus lateralis var. crassus,,Stream Environment and Water Pty Ltd,,Calothamnus lateralis var. 
crassus,,,,Plantae,,,,,,,,,,, -2,WAM,Cowaramup Bay Road,-33.86,115.01,WGS84,,,26/09/2019,,,,,PE:12:8831,,,,,,,,,,,,,,,,,,,,,,Boronia anceps,,Stream Environment and Water Pty Ltd,,Boronia anceps,,,,Plantae,,,,,,,,,,, -3,WAM,Cowaramup Bay Road,-33.86,115.01,WGS84,,,26/09/2019,,,,Test Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Boronia anceps,,Stream Environment and Water Pty Ltd,,Boronia anceps,,,,Plantae,,,,,,,,,,, -4,WAM,Cowaramup Bay Road,-33.86,115.01,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Boronia anceps,,Stream Environment and Water Pty Ltd,,Boronia anceps,,,,Plantae,,,,,,,,,,, -5,WAM,Cowaramup Bay Road,-33.86,114.99,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Banksia sessilis var. cordata,,Stream Environment and Water Pty Ltd,,Banksia sessilis var. cordata,,,,Plantae,,,,,,,,,,, -6,WAM,Cowaramup Bay Road,-33.86,114.99,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Banksia sessilis var. cordata,,Stream Environment and Water Pty Ltd,,Banksia sessilis var. cordata,,,,Plantae,,,,,,,,,,, -7,WAM,Cowaramup Bay Road,-33.86,114.99,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Banksia sessilis var. cordata,,Stream Environment and Water Pty Ltd,,Banksia sessilis var. cordata,,,,Plantae,,,,,,,,,,, -8,WAM,Cowaramup Bay Road,-33.86,114.99,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Banksia sessilis var. cordata,,Stream Environment and Water Pty Ltd,,Banksia sessilis var. 
cordata,,,,Plantae,,,,,,,,,,, -9,WAM,Cowaramup Bay Road,-33.86,115.02,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Caladenia excelsa,,Stream Environment and Water Pty Ltd,,Caladenia excelsa,,,,Plantae,,,,,,,,,,, -10,WAM,Cowaramup Bay Road,-33.86,115.02,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Caladenia excelsa,,Stream Environment and Water Pty Ltd,,Caladenia excelsa,,,,Plantae,,,,,,,,,,, -11,WAM,Cowaramup Bay Road,-33.86,115.02,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Caladenia ?excelsa,,Stream Environment and Water Pty Ltd,,Caladenia excelsa,?,One unopened flower when recorded and one leaf only. ID not confirmed,,Plantae,,,,,,,,,,, -12,WAM,,-33.8,115.21,WGS84,,,26/09/2019,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Caladenia excelsa,,,,Plantae,,,,,,,,,,, -13,WAM,Cowaramup Bay Road,-33.86,115.02,WGS84,,,26/09/2019,,,PreservedSpecimen,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,C01,CC123,WAM,,,,,,,,,Stream Environment and Water Pty Ltd,,Caladenia excelsa,,,,Plantae,,,,,,,,,,, -14,WAM,Cowaramup Bay Road,-33.86,115.02,WGS84,20,Coordinates rounded to the nearest 10 km for conservation concern,26/09/2019,,,HumanObservation,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,C01,CC456,WAM,,,,,,,Caladenia ?excelsa,,Stream Environment and Water Pty Ltd,,Caladenia excelsa,?,Could not confirm the ID due to damaged flower,,Plantae,,,,,,,,,,, -8022FSJMJ079c5cf,WAM,Cowaramup Bay Road,-33.8,115.21,WGS84,50,Coordinates rounded to the nearest 10 km for conservation concern,26/09/2019,,human observation,PreservedSpecimen,Stream Environment and Water Pty Ltd,PE:12:8832,present,"Closed forest of Melaleuca lanceolata. 
White, grey or brown sand, sandy loam.",native,Dried out leaf tips,2,,,adult,male,No breeding evident,MR-456,Stream Environment and Water Pty Ltd,32237,ARACH,WAM,BHP2012-7521 | M12378,BHP,Wet (in ethanol or some other preservative),26/09/2019,https://www.ncbi.nlm.nih.gov/nuccore/MH040669.1 | https://www.ncbi.nlm.nih.gov/nuccore/MH040616.1,Sanger dideoxy sequencing,Caladenia ?excelsa,2019-09-27T12:34+08:00,Stream Environment and Water Pty Ltd,Visually identified in the field (sighting),Caladenia excelsa,species incerta,no flowers present,Caladenia excelsa Hopper & A.P.Br.,Plantae,species,VU,WA,Check against Threatened and Priority Fauna List WA available from https://www.dpaw.wa.gov.au/plants-and-animals/threatened-species-and-communities/threatened-animals. Last updated 13 June 2022,,WA-BIO,Category 1,Department of Biodiversity and Conservation,MR-R1,MR-S1,MR-R1-V1 -ABC123,WAM,Cowaramup Bay Road,-33.8,115.21,WGS84,30,Coordinates generalised,26/09/2019,,new sampling protocol,new basis of record,Stream Environment and Water Pty Ltd,PE:12:8833,new occurrence status,new habitat,new establishment means,Leaves brown,6,,,new life stage,new sex,new reproductiveCondition,MR-457,Stream Environment and Water Pty Ltd,32238,ARACH,WAM,BHP2012-7522 | M12379,BHP,new preparations,27/09/2019,https://www.ncbi.nlm.nih.gov/nuccore/MH040669.1 | https://www.ncbi.nlm.nih.gov/nuccore/MH040616.1,new sequencing method,Caladenia ?excelsa,2019-09-27T12:34+08:00,Stream Environment and Water Pty Ltd,new identification method,Caladenia excelsa,new identification qualifier,new remarks,Caladenia excelsa Hopper & A.P.Br.,new kingdom,new taxon rank,new threat status,WA,a random selection,,,Category 1,Department of Biodiversity and Conservation,MR-R1,MR-S1, 
+providerRecordID,providerRecordIDSource,locality,decimalLatitude,decimalLongitude,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations,eventDateStart,eventDateEnd,samplingProtocol,basisOfRecord,recordedBy,recordNumber,occurrenceStatus,habitat,establishmentMeans,organismRemarks,individualCount,organismQuantity,organismQuantityType,lifeStage,sex,reproductiveCondition,ownerRecordID,ownerRecordIDSource,collectionCode,catalogNumber,catalogNumberSource,otherCatalogNumbers,otherCatalogNumbersSource,preparations,preparedDate,associatedSequences,sequencingMethod,verbatimIdentification,dateIdentified,identifiedBy,identificationMethod,scientificName,identificationQualifier,identificationRemarks,acceptedNameUsage,kingdom,taxonRank,threatStatus,conservationAuthority,threatStatusCheckProtocol,threatStatusDateDetermined,threatStatusDeterminedBy,sensitivityCategory,sensitivityAuthority,surveyID,siteID,siteIDSource,existingBDRSiteIRI,siteVisitID +1,WAM,Cowaramup Bay Road,-33.8,115.21,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Calothamnus lateralis var. crassus,,Stream Environment and Water Pty Ltd,,Calothamnus lateralis var. crassus,,,,Plantae,,,,,,,,,,,,, +2,WAM,Cowaramup Bay Road,-33.86,115.01,WGS84,,,26/09/2019,,,,,PE:12:8831,,,,,,,,,,,,,,,,,,,,,,Boronia anceps,,Stream Environment and Water Pty Ltd,,Boronia anceps,,,,Plantae,,,,,,,,,,,,, +3,WAM,Cowaramup Bay Road,-33.86,115.01,WGS84,,,26/09/2019,,,,Test Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Boronia anceps,,Stream Environment and Water Pty Ltd,,Boronia anceps,,,,Plantae,,,,,,,,,,,,, +4,WAM,Cowaramup Bay Road,-33.86,115.01,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Boronia anceps,,Stream Environment and Water Pty Ltd,,Boronia anceps,,,,Plantae,,,,,,,,,,,,, +5,WAM,Cowaramup Bay Road,-33.86,114.99,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Banksia sessilis var. 
cordata,,Stream Environment and Water Pty Ltd,,Banksia sessilis var. cordata,,,,Plantae,,,,,,,,,,,,, +6,WAM,Cowaramup Bay Road,-33.86,114.99,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Banksia sessilis var. cordata,,Stream Environment and Water Pty Ltd,,Banksia sessilis var. cordata,,,,Plantae,,,,,,,,,,,,, +7,WAM,Cowaramup Bay Road,-33.86,114.99,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Banksia sessilis var. cordata,,Stream Environment and Water Pty Ltd,,Banksia sessilis var. cordata,,,,Plantae,,,,,,,,,,,,, +8,WAM,Cowaramup Bay Road,-33.86,114.99,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Banksia sessilis var. cordata,,Stream Environment and Water Pty Ltd,,Banksia sessilis var. cordata,,,,Plantae,,,,,,,,,,,,, +9,WAM,Cowaramup Bay Road,-33.86,115.02,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Caladenia excelsa,,Stream Environment and Water Pty Ltd,,Caladenia excelsa,,,,Plantae,,,,,,,,,,,,, +10,WAM,Cowaramup Bay Road,-33.86,115.02,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Caladenia excelsa,,Stream Environment and Water Pty Ltd,,Caladenia excelsa,,,,Plantae,,,,,,,,,,,,, +11,WAM,Cowaramup Bay Road,-33.86,115.02,WGS84,,,26/09/2019,,,,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,,,,,,,,,,Caladenia ?excelsa,,Stream Environment and Water Pty Ltd,,Caladenia excelsa,?,One unopened flower when recorded and one leaf only. 
ID not confirmed,,Plantae,,,,,,,,,,,,, +12,WAM,,-33.8,115.21,WGS84,,,26/09/2019,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Caladenia excelsa,,,,Plantae,,,,,,,,,,,,, +13,WAM,Cowaramup Bay Road,-33.86,115.02,WGS84,,,26/09/2019,,,PreservedSpecimen,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,C01,CC123,WAM,,,,,,,,,Stream Environment and Water Pty Ltd,,Caladenia excelsa,,,,Plantae,,,,,,,,,,,,, +14,WAM,Cowaramup Bay Road,-33.86,115.02,WGS84,20,Coordinates rounded to the nearest 10 km for conservation concern,26/09/2019,,,HumanObservation,Stream Environment and Water Pty Ltd,,,,,,,,,,,,,,C01,CC456,WAM,,,,,,,Caladenia ?excelsa,,Stream Environment and Water Pty Ltd,,Caladenia excelsa,?,Could not confirm the ID due to damaged flower,,Plantae,,,,,,,,,,,,, +8022FSJMJ079c5cf,WAM,Cowaramup Bay Road,-33.8,115.21,WGS84,50,Coordinates rounded to the nearest 10 km for conservation concern,26/09/2019,,human observation,PreservedSpecimen,Stream Environment and Water Pty Ltd,PE:12:8832,present,"Closed forest of Melaleuca lanceolata. White, grey or brown sand, sandy loam.",native,Dried out leaf tips,2,,,adult,male,No breeding evident,MR-456,Stream Environment and Water Pty Ltd,32237,ARACH,WAM,BHP2012-7521 | M12378,BHP,Wet (in ethanol or some other preservative),26/09/2019,https://www.ncbi.nlm.nih.gov/nuccore/MH040669.1 | https://www.ncbi.nlm.nih.gov/nuccore/MH040616.1,Sanger dideoxy sequencing,Caladenia ?excelsa,2019-09-27T12:34+08:00,Stream Environment and Water Pty Ltd,Visually identified in the field (sighting),Caladenia excelsa,species incerta,no flowers present,Caladenia excelsa Hopper & A.P.Br.,Plantae,species,VU,WA,Check against Threatened and Priority Fauna List WA available from https://www.dpaw.wa.gov.au/plants-and-animals/threatened-species-and-communities/threatened-animals. 
Last updated 13 June 2022,,WA-BIO,Category 1,Department of Biodiversity and Conservation,MR-R1,MR-S1,WAM,,MR-R1-V1 +ABC123,WAM,Cowaramup Bay Road,-33.8,115.21,WGS84,30,Coordinates generalised,26/09/2019,,new sampling protocol,new basis of record,Stream Environment and Water Pty Ltd,PE:12:8833,new occurrence status,new habitat,new establishment means,Leaves brown,6,,,new life stage,new sex,new reproductiveCondition,MR-457,Stream Environment and Water Pty Ltd,32238,ARACH,WAM,BHP2012-7522 | M12379,BHP,new preparations,27/09/2019,https://www.ncbi.nlm.nih.gov/nuccore/MH040669.1 | https://www.ncbi.nlm.nih.gov/nuccore/MH040616.1,new sequencing method,Caladenia ?excelsa,2019-09-27T12:34+08:00,Stream Environment and Water Pty Ltd,new identification method,Caladenia excelsa,new identification qualifier,new remarks,Caladenia excelsa Hopper & A.P.Br.,new kingdom,new taxon rank,new threat status,WA,a random selection,,,Category 1,Department of Biodiversity and Conservation,MR-R1,MR-S1,WAM,, diff --git a/abis_mapping/templates/survey_occurrence_data_v3/examples/organism_qty.csv b/abis_mapping/templates/survey_occurrence_data_v3/examples/organism_qty.csv index a14070f7..1eec6395 100644 --- a/abis_mapping/templates/survey_occurrence_data_v3/examples/organism_qty.csv +++ b/abis_mapping/templates/survey_occurrence_data_v3/examples/organism_qty.csv @@ -1,2 +1,2 @@ 
-providerRecordID,providerRecordIDSource,locality,decimalLatitude,decimalLongitude,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations,eventDateStart,eventDateEnd,samplingProtocol,basisOfRecord,recordedBy,recordNumber,occurrenceStatus,habitat,establishmentMeans,organismRemarks,individualCount,organismQuantity,organismQuantityType,lifeStage,sex,reproductiveCondition,ownerRecordID,ownerRecordIDSource,collectionCode,catalogNumber,catalogNumberSource,otherCatalogNumbers,otherCatalogNumbersSource,preparations,preparedDate,associatedSequences,sequencingMethod,verbatimIdentification,dateIdentified,identifiedBy,identificationMethod,scientificName,identificationQualifier,identificationRemarks,acceptedNameUsage,kingdom,taxonRank,threatStatus,conservationAuthority,threatStatusCheckProtocol,threatStatusDateDetermined,threatStatusDeterminedBy,sensitivityCategory,sensitivityAuthority,surveyID,siteID,siteVisitID -A0010,Gaia Resources,Cowaramup Bay Road,-33.8,115.21,WGS84,,,24/09/2019,,,,,,,,,,,0.05,percentageCoverage,,,,,,,,,,,,,,,,,,,Calothamnus lateralis var. 
crassus,,,,Plantae,,,,,,,,,,P1, +providerRecordID,providerRecordIDSource,locality,decimalLatitude,decimalLongitude,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations,eventDateStart,eventDateEnd,samplingProtocol,basisOfRecord,recordedBy,recordNumber,occurrenceStatus,habitat,establishmentMeans,organismRemarks,individualCount,organismQuantity,organismQuantityType,lifeStage,sex,reproductiveCondition,ownerRecordID,ownerRecordIDSource,collectionCode,catalogNumber,catalogNumberSource,otherCatalogNumbers,otherCatalogNumbersSource,preparations,preparedDate,associatedSequences,sequencingMethod,verbatimIdentification,dateIdentified,identifiedBy,identificationMethod,scientificName,identificationQualifier,identificationRemarks,acceptedNameUsage,kingdom,taxonRank,threatStatus,conservationAuthority,threatStatusCheckProtocol,threatStatusDateDetermined,threatStatusDeterminedBy,sensitivityCategory,sensitivityAuthority,surveyID,siteID,siteIDSource,existingBDRSiteIRI,siteVisitID +A0010,Gaia Resources,Cowaramup Bay Road,-33.8,115.21,WGS84,,,24/09/2019,,,,,,,,,,,0.05,percentageCoverage,,,,,,,,,,,,,,,,,,,Calothamnus lateralis var. crassus,,,,Plantae,,,,,,,,,,P1,TERN,, diff --git a/abis_mapping/templates/survey_occurrence_data_v3/mapping.py b/abis_mapping/templates/survey_occurrence_data_v3/mapping.py index 04d1af5e..5b890a34 100644 --- a/abis_mapping/templates/survey_occurrence_data_v3/mapping.py +++ b/abis_mapping/templates/survey_occurrence_data_v3/mapping.py @@ -81,11 +81,11 @@ def apply_validation(self, data: base.types.ReadableType, **kwargs: Any) -> fric Keyword Args: survey_id_set (Set[str]): Set of surveyIDs from the metadata template. - site_id_geometry_map (dict[str, str]): Default values to use for geometry - for given siteID. + site_id_geometry_map (dict[models.identifier.SiteIdentifier, str]): Default values to use for geometry + for given site identifier. 
site_visit_id_temporal_map (dict[str, str]): Default RDF (serialized as turtle) to use for temporal entity for given siteVisitID. - site_visit_id_site_id_map (dict[str, str]): Valid site ID for a given site visit ID. + site_visit_id_site_id_map (dict[str, models.identifier.SiteIdentifier | None]): Valid SiteIdentifier for a given site visit ID. Returns: frictionless.Report: Validation report for the specified data. @@ -129,8 +129,11 @@ def apply_validation(self, data: base.types.ReadableType, **kwargs: Any) -> fric plugins.mutual_inclusion.MutuallyInclusive( field_names=["sensitivityCategory", "sensitivityAuthority"], ), - plugins.chained_inclusion.ChainedInclusion( - field_names=["siteVisitID", "siteID"], + plugins.mutual_inclusion.MutuallyInclusive( + field_names=["siteID", "siteIDSource"], + ), + plugins.site_id_or_iri_validation.SiteIdentifierCheck( + skip_when_missing="siteVisitID", ), ], ) @@ -144,12 +147,10 @@ def apply_validation(self, data: base.types.ReadableType, **kwargs: Any) -> fric # Modify checklist in the event site visit id to site id map provided if site_visit_id_site_id_map is not None: - # Add lookup match check + # Add check that siteVisitID->Site in this template agrees with site visit template. checklist.add_check( - plugins.lookup_match.VLookupMatch( - key_field="siteVisitID", - value_field="siteID", - lu_map=site_visit_id_site_id_map, + plugins.site_identifier_match.SiteIdentifierMatches( + site_visit_id_site_id_map=site_visit_id_site_id_map, ) ) @@ -178,9 +179,14 @@ def apply_validation(self, data: base.types.ReadableType, **kwargs: Any) -> fric # Perform a default lookup check based on passed in map. 
checklist.add_check( plugins.default_lookup.DefaultLookup( - key_field="siteID", + key_field=models.identifier.SiteIdentifier.from_row, value_field="decimalLatitude", default_map=site_id_geometry_map, + no_key_error_template=( + "decimalLatitude, decimalLongitude and geodeticDatum must be provided, " + "or siteID and siteIDSource, or existingBDRSiteIRI, must be provided to use the geometry of a Site." + ), + no_default_error_template="Could not find a Site with {key_value} to use for geometry.", ) ) # Mutual inclusion check to close out the possibility of one missing. @@ -207,14 +213,14 @@ def apply_validation(self, data: base.types.ReadableType, **kwargs: Any) -> fric def extract_site_id_keys( self, data: base.types.ReadableType, - ) -> dict[str, bool]: + ) -> dict[models.identifier.SiteIdentifier, bool]: """Extract site id key values from the data. Args: data (base.types.ReadableType): Raw data to be mapped. Returns: - dict[str, bool]: Keys are the site id values encountered + dict[models.identifier.SiteIdentifier, bool]: Keys are the site id values encountered in the data, values are all 'True', """ # Construct schema @@ -228,10 +234,14 @@ def extract_site_id_keys( encoding="utf-8", ) + result: dict[models.identifier.SiteIdentifier, bool] = {} # Iterate over rows to extract values with resource.open() as r: - # Construct dictionary and return - return {row["siteID"]: True for row in r.row_stream if row["siteID"] is not None} + for row in r.row_stream: + site_identifier = models.identifier.SiteIdentifier.from_row(row) + if site_identifier: + result[site_identifier] = True + return result def extract_site_visit_id_keys( self, diff --git a/abis_mapping/templates/survey_occurrence_data_v3/schema.json b/abis_mapping/templates/survey_occurrence_data_v3/schema.json index 72262a92..9a671882 100644 --- a/abis_mapping/templates/survey_occurrence_data_v3/schema.json +++ b/abis_mapping/templates/survey_occurrence_data_v3/schema.json @@ -739,6 +739,28 @@ "required": false } 
}, + { + "name": "siteIDSource", + "title": "Site ID Source", + "description": "The organisation that assigned the SiteID to this Site", + "example": "TERN", + "type": "string", + "format": "default", + "constraints": { + "required": false + } + }, + { + "name": "existingBDRSiteIRI", + "title": "Existing BDR Site IRI", + "description": "Corresponds to a unique site IRI, provided within accompanying survey_site_data.csv template.", + "example": "https://linked.data.gov.au/dataset/bdr/site/TERN/P1", + "type": "string", + "format": "uri", + "constraints": { + "required": false + } + }, { "name": "siteVisitID", "title": "Site Visit ID", @@ -760,10 +782,17 @@ } }, { - "fields": "siteID", + "fields": ["siteID", "siteIDSource"], + "reference": { + "resource": "survey_site_data", + "fields": ["siteID", "siteIDSource"] + } + }, + { + "fields": "existingBDRSiteIRI", "reference": { "resource": "survey_site_data", - "fields": "siteID" + "fields": "existingBDRSiteIRI" } }, { diff --git a/abis_mapping/templates/survey_occurrence_data_v3/survey_occurrence_data.csv b/abis_mapping/templates/survey_occurrence_data_v3/survey_occurrence_data.csv index 815a40c6..dd6bf939 100644 --- a/abis_mapping/templates/survey_occurrence_data_v3/survey_occurrence_data.csv +++ b/abis_mapping/templates/survey_occurrence_data_v3/survey_occurrence_data.csv @@ -1 +1 @@ 
-providerRecordID,providerRecordIDSource,locality,decimalLatitude,decimalLongitude,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations,eventDateStart,eventDateEnd,samplingProtocol,basisOfRecord,recordedBy,recordNumber,occurrenceStatus,habitat,establishmentMeans,organismRemarks,individualCount,organismQuantity,organismQuantityType,lifeStage,sex,reproductiveCondition,ownerRecordID,ownerRecordIDSource,collectionCode,catalogNumber,catalogNumberSource,otherCatalogNumbers,otherCatalogNumbersSource,preparations,preparedDate,associatedSequences,sequencingMethod,verbatimIdentification,dateIdentified,identifiedBy,identificationMethod,scientificName,identificationQualifier,identificationRemarks,acceptedNameUsage,kingdom,taxonRank,threatStatus,conservationAuthority,threatStatusCheckProtocol,threatStatusDateDetermined,threatStatusDeterminedBy,sensitivityCategory,sensitivityAuthority,surveyID,siteID,siteVisitID +providerRecordID,providerRecordIDSource,locality,decimalLatitude,decimalLongitude,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations,eventDateStart,eventDateEnd,samplingProtocol,basisOfRecord,recordedBy,recordNumber,occurrenceStatus,habitat,establishmentMeans,organismRemarks,individualCount,organismQuantity,organismQuantityType,lifeStage,sex,reproductiveCondition,ownerRecordID,ownerRecordIDSource,collectionCode,catalogNumber,catalogNumberSource,otherCatalogNumbers,otherCatalogNumbersSource,preparations,preparedDate,associatedSequences,sequencingMethod,verbatimIdentification,dateIdentified,identifiedBy,identificationMethod,scientificName,identificationQualifier,identificationRemarks,acceptedNameUsage,kingdom,taxonRank,threatStatus,conservationAuthority,threatStatusCheckProtocol,threatStatusDateDetermined,threatStatusDeterminedBy,sensitivityCategory,sensitivityAuthority,surveyID,siteID,siteIDSource,existingBDRSiteIRI,siteVisitID diff --git a/abis_mapping/templates/survey_occurrence_data_v3/templates/instructions.md 
b/abis_mapping/templates/survey_occurrence_data_v3/templates/instructions.md index 7c1bae21..2e93004d 100644 --- a/abis_mapping/templates/survey_occurrence_data_v3/templates/instructions.md +++ b/abis_mapping/templates/survey_occurrence_data_v3/templates/instructions.md @@ -101,9 +101,20 @@ Changes from Systematic Survey Occurrence Data Template v2.0.0 ### CHANGED FIELDS +* Add field [`siteIDSource`](#siteIDSource-field). +* Add field [`existingBDRSiteIRI`](#existingBDRSiteIRI-field). + +### CHANGED VALIDATION + * When [`surveyID`](#surveyID-field) is provided, it must have a value that matches a `surveyID` in the Systematic Survey Metadata template to indicate which Survey the Occurrence belongs to. When [`surveyID`](#surveyID-field) is blank, the Occurrence will be treated as incidental. +* [`siteID`](#siteID-field) and the new field [`siteIDSource`](#siteIDSource-field) are conditionally mandatory. +Must be provided together, or neither provided. +* When [`siteVisitID`](#siteVisitID-field) is provided, +either [`siteID`](#siteID-field) and [`siteIDSource`](#siteIDSource-field), +or [`existingBDRSiteIRI`](#existingBDRSiteIRI-field), +or both, must be provided. 
## APPENDICES ### APPENDIX-I: Vocabulary List diff --git a/abis_mapping/templates/survey_site_data_v3/examples/minimal-error-duplicate-site-ids.csv b/abis_mapping/templates/survey_site_data_v3/examples/minimal-error-duplicate-site-ids.csv index 91b95810..d47cc227 100644 --- a/abis_mapping/templates/survey_site_data_v3/examples/minimal-error-duplicate-site-ids.csv +++ b/abis_mapping/templates/survey_site_data_v3/examples/minimal-error-duplicate-site-ids.csv @@ -1,3 +1,3 @@ -siteID,siteIDSource,siteType,siteName,siteDescription,habitat,relatedSiteID,relationshipToRelatedSite,locality,decimalLatitude,decimalLongitude,footprintWKT,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations -P1,WAM,Plot,Plot 1,Fine woody debris.,,,,Cowaramup Bay Road,-34.036,146.363,"LINESTRING (146.363 -34.036, 146.363 -34.037)",WGS84,50, -P1,WAM,Plot,Plot 1,Fine woody debris.,,,,Cowaramup Bay Road,-34.036,146.363,"LINESTRING (146.363 -34.036, 146.363 -34.037)",WGS84,50, +siteID,siteIDSource,existingBDRSiteIRI,siteType,siteName,siteDescription,habitat,relatedSiteID,relationshipToRelatedSite,locality,decimalLatitude,decimalLongitude,footprintWKT,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations +P1,WAM,,Plot,Plot 1,Fine woody debris.,,,,Cowaramup Bay Road,-34.036,146.363,"LINESTRING (146.363 -34.036, 146.363 -34.037)",WGS84,50, +P1,WAM,,Plot,Plot 1,Fine woody debris.,,,,Cowaramup Bay Road,-34.036,146.363,"LINESTRING (146.363 -34.036, 146.363 -34.037)",WGS84,50, diff --git a/abis_mapping/templates/survey_site_data_v3/examples/minimal-error-missing-fields.csv b/abis_mapping/templates/survey_site_data_v3/examples/minimal-error-missing-fields.csv index 211eda4e..30dc8871 100644 --- a/abis_mapping/templates/survey_site_data_v3/examples/minimal-error-missing-fields.csv +++ b/abis_mapping/templates/survey_site_data_v3/examples/minimal-error-missing-fields.csv @@ -1,2 +1,2 @@ 
-siteID,siteIDSource,siteType,siteName,siteDescription,habitat,relatedSiteID,relationshipToRelatedSite,locality,decimalLatitude,decimalLongitude,footprintWKT,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations -P1,WAM,Plot,Plot 1,Fine woody debris.,,,part of,Cowaramup Bay Road,-34.036,146.363,"LINESTRING (146.363 -34.036, 146.363 -34.037)","",50, +siteID,siteIDSource,existingBDRSiteIRI,siteType,siteName,siteDescription,habitat,relatedSiteID,relationshipToRelatedSite,locality,decimalLatitude,decimalLongitude,footprintWKT,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations +P1,WAM,,Plot,Plot 1,Fine woody debris.,,,part of,Cowaramup Bay Road,-34.036,146.363,"LINESTRING (146.363 -34.036, 146.363 -34.037)","",50, diff --git a/abis_mapping/templates/survey_site_data_v3/examples/minimal.csv b/abis_mapping/templates/survey_site_data_v3/examples/minimal.csv index 411e8da1..a3d74b3e 100644 --- a/abis_mapping/templates/survey_site_data_v3/examples/minimal.csv +++ b/abis_mapping/templates/survey_site_data_v3/examples/minimal.csv @@ -1,5 +1,5 @@ -siteID,siteIDSource,siteType,siteName,siteDescription,habitat,relatedSiteID,relationshipToRelatedSite,locality,decimalLatitude,decimalLongitude,footprintWKT,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations -P0,WAM,Site,ParentSite,Footprint of study area,Closed forest,,,Cowaramup Bay Road,,,"POLYGON ((114.98 -33.85, 115.01 -33.85, 115.01 -33.87, 114.98 -33.87, 114.98 -33.85))",WGS84,50, -P1,WAM,Plot,Plot 1,Fine woody debris.,Closed forest,P0,partOf,Cowaramup Bay Road,-33.85,114.99,"LINESTRING (114.99 -33.85, 115.00 -33.85)",WGS84,50,Coordinates rounded to the nearest 10 km for conservation concern -P2,WAM,Plot,Plot 2,Fine woody debris.,Closed forest,S0,sameAs,Cowaramup Bay Road,-33.85,114.99,"LINESTRING (114.99 -33.85, 115.00 -33.85)",WGS84,50,Coordinates rounded to the nearest 10 km for conservation concern -P3,WAM,Plot,Plot 3,Fine woody debris.,Closed 
forest,http://example.com/site/S0,sameAs,Cowaramup Bay Road,-33.85,114.99,"LINESTRING (114.99 -33.85, 115.00 -33.85)",WGS84,50,Coordinates rounded to the nearest 10 km for conservation concern +siteID,siteIDSource,existingBDRSiteIRI,siteType,siteName,siteDescription,habitat,relatedSiteID,relationshipToRelatedSite,locality,decimalLatitude,decimalLongitude,footprintWKT,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations +P0,WAM,,Site,ParentSite,Footprint of study area,Closed forest,,,Cowaramup Bay Road,,,"POLYGON ((114.98 -33.85, 115.01 -33.85, 115.01 -33.87, 114.98 -33.87, 114.98 -33.85))",WGS84,50, +P1,WAM,,Plot,Plot 1,Fine woody debris.,Closed forest,P0,partOf,Cowaramup Bay Road,-33.85,114.99,"LINESTRING (114.99 -33.85, 115.00 -33.85)",WGS84,50,Coordinates rounded to the nearest 10 km for conservation concern +P2,WAM,,Plot,Plot 2,Fine woody debris.,Closed forest,S0,sameAs,Cowaramup Bay Road,-33.85,114.99,"LINESTRING (114.99 -33.85, 115.00 -33.85)",WGS84,50,Coordinates rounded to the nearest 10 km for conservation concern +P3,WAM,,Plot,Plot 3,Fine woody debris.,Closed forest,http://example.com/site/S0,sameAs,Cowaramup Bay Road,-33.85,114.99,"LINESTRING (114.99 -33.85, 115.00 -33.85)",WGS84,50,Coordinates rounded to the nearest 10 km for conservation concern diff --git a/abis_mapping/templates/survey_site_data_v3/mapping.py b/abis_mapping/templates/survey_site_data_v3/mapping.py index 7d3da6c6..7f96d042 100644 --- a/abis_mapping/templates/survey_site_data_v3/mapping.py +++ b/abis_mapping/templates/survey_site_data_v3/mapping.py @@ -63,13 +63,15 @@ def apply_validation( **kwargs (Any): Additional keyword arguments. Keyword Args: - site_id_map (dict[str, bool]): Site ids present in the occurrence template. + site_id_map (dict[models.identifier.SiteIdentifier, bool]): Site ids present in the occurrence template. Returns: frictionless.Report: Validation report for the specified data.
""" # Extract keyword arguments - site_id_map: dict[str, bool] = kwargs.get("site_id_map", {}) + site_id_map: dict[models.identifier.SiteIdentifier, bool] = kwargs.get("site_id_map", {}) + if site_id_map is None: + raise ValueError("If provided, site_id_map must not be None") # Construct schema schema = self.extra_fields_schema( @@ -92,8 +94,22 @@ def apply_validation( # Extra custom checks plugins.tabular.IsTabular(), plugins.empty.NotEmpty(), + # Validation of the ID-related fields + plugins.site_id_or_iri_validation.SiteIdentifierCheck(), + plugins.mutual_inclusion.MutuallyInclusive( + field_names=["siteID", "siteIDSource"], + ), + plugins.unique_together.UniqueTogether( + fields=["siteID", "siteIDSource"], + null_handling="skip", + error_message_template=( + "siteID and siteIDSource must be unique for each Row. " + '[{values}] have already been used in the row at position "{first_seen_row_number}"' + ), + ), + # Other fields' validation plugins.sites_geometry.SitesGeometry( - occurrence_site_ids=set(site_id_map), + occurrence_site_identifiers=site_id_map, ), plugins.mutual_inclusion.MutuallyInclusive( field_names=["relatedSiteID", "relationshipToRelatedSite"], @@ -140,7 +156,7 @@ def extract_site_ids( def extract_geometry_defaults( self, data: base.types.ReadableType, - ) -> dict[str, str]: + ) -> dict[models.identifier.SiteIdentifier, str]: """Constructs a dictionary mapping site id to default WKT. The resulting string WKT returned can then be used as the missing @@ -150,10 +166,10 @@ def extract_geometry_defaults( data (base.types.ReadableType): Raw data to be mapped. Returns: - dict[str, str]: Keys are the site id; values are the - appropriate point WKT serialized string. If none then - there is no siteID key created. Values include the geodetic - datum uri. + Mapping with SiteIdentifier as the keys; values are the + appropriate point WKT serialized string. If none then + there is no siteID key created. Values include the geodetic + datum uri.
""" # Construct schema schema = frictionless.Schema.from_descriptor(self.schema()) @@ -169,14 +185,14 @@ def extract_geometry_defaults( # Context manager for row streaming with resource.open() as r: # Create empty dictionary to hold mapping values - result: dict[str, str] = {} + result: dict[models.identifier.SiteIdentifier, str] = {} for row in r.row_stream: # Extract values - site_id: str | None = row["siteID"] + site_identifier = models.identifier.SiteIdentifier.from_row(row) - # Check for siteID, even though siteID is a mandatory field, it can be missing here + # Check there is an identifier, even though it is mandatory field, it can be missing here # because this method is called for cross-validation, regardless of if this template is valid. - if not site_id: + if not site_identifier: continue footprint_wkt: shapely.geometry.base.BaseGeometry | None = row["footprintWKT"] @@ -192,7 +208,7 @@ def extract_geometry_defaults( # Default to using the footprint wkt + geodetic datum if footprint_wkt is not None: # Create string and add to map for site id - result[site_id] = str( + result[site_identifier] = str( models.spatial.Geometry( raw=footprint_wkt.centroid, datum=datum, @@ -203,7 +219,7 @@ def extract_geometry_defaults( # If not footprint then we revert to using supplied longitude & latitude if longitude is not None and latitude is not None: # Create string and add to map for site id - result[site_id] = str( + result[site_identifier] = str( models.spatial.Geometry( raw=shapely.Point([float(longitude), float(latitude)]), datum=datum, @@ -235,8 +251,8 @@ def apply_mapping_row( """ # TERN.Site subject IRI - Note this needs to match the iri construction of the # survey site visit and occurrence template mapping, ensuring they will resolve properly. 
- site_id: str = row["siteID"] - site = utils.iri_patterns.site_iri(base_iri, site_id) + site_id: str | None = row["siteID"] + site = utils.iri_patterns.site_iri(base_iri, site_id) # type: ignore[arg-type] # TODO fix when doing mapping # Conditionally create uris dependent on siteIDSource site_id_src: str | None = row["siteIDSource"] @@ -430,7 +446,7 @@ def add_site( base_iri: Namespace used to construct IRIs """ # Extract relevant values - site_id = row["siteID"] + site_id: str | None = row["siteID"] site_name = row["siteName"] site_type = row["siteType"] site_description = row["siteDescription"] diff --git a/abis_mapping/templates/survey_site_data_v3/schema.json b/abis_mapping/templates/survey_site_data_v3/schema.json index bb36837a..e3a5b0f8 100644 --- a/abis_mapping/templates/survey_site_data_v3/schema.json +++ b/abis_mapping/templates/survey_site_data_v3/schema.json @@ -3,13 +3,12 @@ { "name": "siteID", "title": "Site ID", - "description": "A unique within dataset string identifier for the site. Valid values include strings that are used specifically for this survey or URIs from BDR Sites that have been established in previous surveys.", + "description": "An identifier for the site. Within the dataset, should be unique per siteIDSource", "example": "P1", "type": "string", "format": "default", "constraints": { - "required": true, - "unique": true + "required": false } }, { @@ -23,6 +22,18 @@ "required": false } }, + { + "name": "existingBDRSiteIRI", + "title": "Existing BDR Site IRI", + "description": "Verbatim IRI of an existing Site in the BDR that new information is being added to. The IRI will typically start with https://linked.data.gov.au/dataset/bdr/. 
This field can be supplied as an alternative, or in addition, to siteID + siteIDSource when a Site already exists in the BDR.", + "example": "https://linked.data.gov.au/dataset/bdr/site/TERN/P1", + "type": "string", + "format": "uri", + "constraints": { + "required": false, + "unique": true + } + }, { "name": "siteType", "title": "Site Type", @@ -211,7 +222,6 @@ "required": false } } - ], - "primaryKey": "siteID" + ] } diff --git a/abis_mapping/templates/survey_site_data_v3/survey_site_data.csv b/abis_mapping/templates/survey_site_data_v3/survey_site_data.csv index 76b423e1..9ab67580 100644 --- a/abis_mapping/templates/survey_site_data_v3/survey_site_data.csv +++ b/abis_mapping/templates/survey_site_data_v3/survey_site_data.csv @@ -1 +1 @@ -siteID,siteIDSource,siteType,siteName,siteDescription,habitat,relatedSiteID,relationshipToRelatedSite,locality,decimalLatitude,decimalLongitude,footprintWKT,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations +siteID,siteIDSource,existingBDRSiteIRI,siteType,siteName,siteDescription,habitat,relatedSiteID,relationshipToRelatedSite,locality,decimalLatitude,decimalLongitude,footprintWKT,geodeticDatum,coordinateUncertaintyInMeters,dataGeneralizations diff --git a/abis_mapping/templates/survey_site_data_v3/templates/instructions.md b/abis_mapping/templates/survey_site_data_v3/templates/instructions.md index 89866069..18f54cab 100644 --- a/abis_mapping/templates/survey_site_data_v3/templates/instructions.md +++ b/abis_mapping/templates/survey_site_data_v3/templates/instructions.md @@ -93,7 +93,23 @@ For example, `fieldNotes`, `continent`, `country`, `countryCode`, `stateProvince ## CHANGELOG -No changes from Systematic Survey Site Data Template v2.0.0 +Changes from Systematic Survey Site Data Template v2.0.0 + +### CHANGED FIELDS + +* Add field [`existingBDRSiteIRI`](#existingBDRSiteIRI-field). Type is URI, can be blank. +Rows with values must be unique within a template. 
+ +### CHANGED VALIDATION + +* [`siteID`](#siteID-field) is no longer required or unique on its own. Instead: +* [`siteID`](#siteID-field) and [`siteIDSource`](#siteIDSource-field) are conditionally mandatory. +Must be provided together, or neither provided. +* [`siteID`](#siteID-field) and [`siteIDSource`](#siteIDSource-field) are unique together, +i.e. each row with these fields must have a unique combination. +* Either [`siteID`](#siteID-field) and [`siteIDSource`](#siteIDSource-field), +or [`existingBDRSiteIRI`](#existingBDRSiteIRI-field), +or both, must be provided in each row. ## APPENDICES ### APPENDIX-I: Vocabulary List diff --git a/abis_mapping/templates/survey_site_visit_data_v3/examples/minimal-error-dates-wrong-order.csv b/abis_mapping/templates/survey_site_visit_data_v3/examples/minimal-error-dates-wrong-order.csv index 368fd74c..d9f10443 100644 --- a/abis_mapping/templates/survey_site_visit_data_v3/examples/minimal-error-dates-wrong-order.csv +++ b/abis_mapping/templates/survey_site_visit_data_v3/examples/minimal-error-dates-wrong-order.csv @@ -1,2 +1,2 @@ -surveyID,siteID,siteIDSource,siteVisitID,siteVisitStart,siteVisitEnd,visitOrgs,visitObservers,condition,targetTaxonomicScope,protocolName,protocolDescription,samplingEffortValue,samplingEffortUnit -S1,PLOT1,GAIA,VA-99,2024-10-01,2024-09-22,GAIA,John Smith,Burnt,Coleoptera,harpTrapping,Three conventional harp traps,20 x 12,trapDays +surveyID,siteID,siteIDSource,existingBDRSiteIRI,siteVisitID,siteVisitStart,siteVisitEnd,visitOrgs,visitObservers,condition,targetTaxonomicScope,protocolName,protocolDescription,samplingEffortValue,samplingEffortUnit +S1,PLOT1,GAIA,,VA-99,2024-10-01,2024-09-22,GAIA,John Smith,Burnt,Coleoptera,harpTrapping,Three conventional harp traps,20 x 12,trapDays diff --git a/abis_mapping/templates/survey_site_visit_data_v3/examples/minimal-error-no-dates.csv b/abis_mapping/templates/survey_site_visit_data_v3/examples/minimal-error-no-dates.csv index e7fb3438..d6dae046 100644 ---
a/abis_mapping/templates/survey_site_visit_data_v3/examples/minimal-error-no-dates.csv +++ b/abis_mapping/templates/survey_site_visit_data_v3/examples/minimal-error-no-dates.csv @@ -1,2 +1,2 @@ -surveyID,siteID,siteIDSource,siteVisitID,siteVisitStart,siteVisitEnd,visitOrgs,visitObservers,condition,targetTaxonomicScope,protocolName,protocolDescription,samplingEffortValue,samplingEffortUnit -S1,PLOT1,GAIA,VA-99,,,GAIA,John Smith,Burnt,Coleoptera,harpTrapping,Three conventional harp traps,20 x 12,trapDays +surveyID,siteID,siteIDSource,existingBDRSiteIRI,siteVisitID,siteVisitStart,siteVisitEnd,visitOrgs,visitObservers,condition,targetTaxonomicScope,protocolName,protocolDescription,samplingEffortValue,samplingEffortUnit +S1,PLOT1,GAIA,,VA-99,,,GAIA,John Smith,Burnt,Coleoptera,harpTrapping,Three conventional harp traps,20 x 12,trapDays diff --git a/abis_mapping/templates/survey_site_visit_data_v3/examples/minimal.csv b/abis_mapping/templates/survey_site_visit_data_v3/examples/minimal.csv index f0ecd031..54e16c72 100644 --- a/abis_mapping/templates/survey_site_visit_data_v3/examples/minimal.csv +++ b/abis_mapping/templates/survey_site_visit_data_v3/examples/minimal.csv @@ -1,4 +1,4 @@ -surveyID,siteID,siteIDSource,siteVisitID,siteVisitStart,siteVisitEnd,visitOrgs,visitObservers,condition,targetTaxonomicScope,protocolName,protocolDescription,samplingEffortValue,samplingEffortUnit -TIS-24-03,P1,WAM,TIS-24-03-P1-01,2024-03-12,2024-04-04,WAM | DBCA,ORCID00001 | ORCID00002,dry,new_taxon,wet pitfall trap,10 x square buckets of size 20 x 20 x 15 cm. Propylene glycol.,240,trap nights -TIS-24-03,P1,WAM,TIS-24-03-P1-02,2024-03-12,2024-03-12,WAM,ORCID00001,moist leaf litter after recent rain,invertebrate,litter sifting,50 cm diameter sifter with 5 mm mesh. 
Litter samles taken ~1 metre from each pitfall trap,10,sifts -TIS-24-03,P1,WAM,TIS-24-03-P1-03,2024-03-12,,WAM,ORCID00003,,bird,human observation,,, +surveyID,siteID,siteIDSource,existingBDRSiteIRI,siteVisitID,siteVisitStart,siteVisitEnd,visitOrgs,visitObservers,condition,targetTaxonomicScope,protocolName,protocolDescription,samplingEffortValue,samplingEffortUnit +TIS-24-03,P1,WAM,,TIS-24-03-P1-01,2024-03-12,2024-04-04,WAM | DBCA,ORCID00001 | ORCID00002,dry,new_taxon,wet pitfall trap,10 x square buckets of size 20 x 20 x 15 cm. Propylene glycol.,240,trap nights +TIS-24-03,P1,WAM,,TIS-24-03-P1-02,2024-03-12,2024-03-12,WAM,ORCID00001,moist leaf litter after recent rain,invertebrate,litter sifting,50 cm diameter sifter with 5 mm mesh. Litter samles taken ~1 metre from each pitfall trap,10,sifts +TIS-24-03,P1,WAM,,TIS-24-03-P1-03,2024-03-12,,WAM,ORCID00003,,bird,human observation,,, diff --git a/abis_mapping/templates/survey_site_visit_data_v3/mapping.py b/abis_mapping/templates/survey_site_visit_data_v3/mapping.py index a830ce7f..295ac4d0 100644 --- a/abis_mapping/templates/survey_site_visit_data_v3/mapping.py +++ b/abis_mapping/templates/survey_site_visit_data_v3/mapping.py @@ -69,6 +69,10 @@ def apply_validation( checks = [ plugins.tabular.IsTabular(), plugins.empty.NotEmpty(), + plugins.mutual_inclusion.MutuallyInclusive( + field_names=["siteID", "siteIDSource"], + ), + plugins.site_id_or_iri_validation.SiteIdentifierCheck(), plugins.chronological.ChronologicalOrder( field_names=["siteVisitStart", "siteVisitEnd"], ), @@ -97,14 +101,15 @@ def apply_validation( def extract_site_visit_id_to_site_id_map( self, data: base.types.ReadableType, - ) -> dict[str, str]: - """Constructs a dictionary mapping site visit id to site id. + ) -> dict[str, models.identifier.SiteIdentifier | None]: + """Constructs a dictionary mapping site visit id to SiteIdentifier. Args: data: Raw data to be mapped. Returns: - Map with site visit id for keys and site id for values. 
+ Map with site visit id for keys and SiteIdentifier for values, + or None for value if there is no identifier. """ # Construct schema schema = frictionless.Schema.from_descriptor(self.schema()) @@ -113,14 +118,20 @@ def extract_site_visit_id_to_site_id_map( resource = frictionless.Resource(source=data, format="csv", schema=schema, encoding="utf-8") # Declare result reference - result: dict[str, str] = {} + result: dict[str, models.identifier.SiteIdentifier | None] = {} # Context manager for row streaming with resource.open() as r: for row in r.row_stream: # Check that the cells have values and add to map - if (svid := row["siteVisitID"]) is not None and (sid := row["siteID"]) is not None: - result[svid] = sid + site_visit_id: str | None = row["siteVisitID"] + site_identifier = models.identifier.SiteIdentifier.from_row(row) + # Put siteVisitID in the map, even when site_identifier is None, + # So the other templates have access to all the provided siteVisitIDs. + # This lets other templates differentiate between 'a siteVisitID not in this template', + # and 'a siteVisitID in this template but with no Site identifier'. + if site_visit_id: + result[site_visit_id] = site_identifier # Return return result diff --git a/abis_mapping/templates/survey_site_visit_data_v3/schema.json b/abis_mapping/templates/survey_site_visit_data_v3/schema.json index 701d8dbf..a420a9bf 100644 --- a/abis_mapping/templates/survey_site_visit_data_v3/schema.json +++ b/abis_mapping/templates/survey_site_visit_data_v3/schema.json @@ -14,12 +14,12 @@ { "name": "siteID", "title": "Site ID", - "description": "A unique within dataset string identifier for the site. 
Valid values include strings that are used specifically for this survey or URIs from BDR Sites that have been established in previous surveys.", + "description": "Corresponds to a unique site identifier, provided within accompanying survey_site_data.csv template.", "example": "P1", "type": "string", "format": "default", "constraints": { - "required": true + "required": false } }, { @@ -33,6 +33,17 @@ "required": false } }, + { + "name": "existingBDRSiteIRI", + "title": "Existing BDR Site IRI", + "description": "Corresponds to a unique site IRI, provided within accompanying survey_site_data.csv template.", + "example": "https://linked.data.gov.au/dataset/bdr/site/TERN/P1", + "type": "string", + "format": "uri", + "constraints": { + "required": false + } + }, { "name": "siteVisitID", "title": "Site Visit ID", @@ -172,6 +183,20 @@ "resource": "survey_metadata", "fields": "surveyID" } + }, + { + "fields": ["siteID", "siteIDSource"], + "reference": { + "resource": "survey_site_data", + "fields": ["siteID", "siteIDSource"] + } + }, + { + "fields": "existingBDRSiteIRI", + "reference": { + "resource": "survey_site_data", + "fields": "existingBDRSiteIRI" + } } ] } diff --git a/abis_mapping/templates/survey_site_visit_data_v3/survey_site_visit_data.csv b/abis_mapping/templates/survey_site_visit_data_v3/survey_site_visit_data.csv index e24bff04..5e33dda6 100644 --- a/abis_mapping/templates/survey_site_visit_data_v3/survey_site_visit_data.csv +++ b/abis_mapping/templates/survey_site_visit_data_v3/survey_site_visit_data.csv @@ -1 +1 @@ -surveyID,siteID,siteIDSource,siteVisitID,siteVisitStart,siteVisitEnd,visitOrgs,visitObservers,condition,targetTaxonomicScope,protocolName,protocolDescription,samplingEffortValue,samplingEffortUnit +surveyID,siteID,siteIDSource,existingBDRSiteIRI,siteVisitID,siteVisitStart,siteVisitEnd,visitOrgs,visitObservers,condition,targetTaxonomicScope,protocolName,protocolDescription,samplingEffortValue,samplingEffortUnit diff --git 
a/abis_mapping/templates/survey_site_visit_data_v3/templates/instructions.md b/abis_mapping/templates/survey_site_visit_data_v3/templates/instructions.md index 598815e2..3273418c 100644 --- a/abis_mapping/templates/survey_site_visit_data_v3/templates/instructions.md +++ b/abis_mapping/templates/survey_site_visit_data_v3/templates/instructions.md @@ -93,8 +93,18 @@ Changes from Systematic Survey Site Visit Data Template v2.0.0 ### CHANGED FIELDS +* Add field [`existingBDRSiteIRI`](#existingBDRSiteIRI-field). + +### CHANGED VALIDATION + * [`surveyID`](#surveyID-field) Is now a **mandatory** field, and every row must have a value that matches a `surveyID` in the Systematic Survey Metadata template to indicate which Survey the Site Visit is related to. +* [`siteID`](#siteID-field) is no longer required on its own. Instead: +* [`siteID`](#siteID-field) and [`siteIDSource`](#siteIDSource-field) are conditionally mandatory. +Must be provided together, or neither provided. +* Either [`siteID`](#siteID-field) and [`siteIDSource`](#siteIDSource-field), +or [`existingBDRSiteIRI`](#existingBDRSiteIRI-field), +or both, must be provided in each row.
## APPENDICES ### APPENDIX-I: Vocabulary List diff --git a/tests/plugins/test_default_lookup.py b/tests/plugins/test_default_lookup.py index c01477ad..3c14c1f2 100644 --- a/tests/plugins/test_default_lookup.py +++ b/tests/plugins/test_default_lookup.py @@ -46,3 +46,78 @@ def test_default_lookup() -> None: # Check assert not report.valid assert len(report.flatten()) == 2 + + +def test_default_lookup_with_callable_key_valid() -> None: + """Tests the default lookup check plugin with a callable key.""" + # Default map + default_map: dict[object, str] = { + "A1": "10", + "B2": "20", + } + + # Construct fake resource + resource = frictionless.Resource( + source=[ + # Valid + {"value": None, "letter": "A", "number": "1"}, + {"value": None, "letter": "B", "number": "2"}, + ] + ) + + # Validate + report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.default_lookup.DefaultLookup( + value_field="value", + key_field=lambda row: row["letter"] + str(row["number"]), + default_map=default_map, + no_key_error_template="", + no_default_error_template="", + ) + ] + ) + ) + + # Check + assert report.valid + + +def test_default_lookup_with_callable_key_invalid() -> None: + """Tests the default lookup check plugin with a callable key and invalid data.""" + # Default map + default_map: dict[object, str] = { + "AA": "10", + } + + # Construct fake resource + resource = frictionless.Resource( + source=[ + # Invalid + {"id": "1", "value": None, "letter": None}, + {"id": "2", "value": None, "letter": "B"}, + ] + ) + + # Validate + report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.default_lookup.DefaultLookup( + value_field="value", + key_field=lambda row: (letter + letter) if (letter := row["letter"]) else None, + default_map=default_map, + no_key_error_template="No key found", + no_default_error_template="No default found for {key_value}", + ) + ] + ) + ) + + # Check + assert not report.valid + assert
len(report.tasks) == 1 + assert len(report.tasks[0].errors) == 2 + assert report.tasks[0].errors[0].message == "The row at position 2 has an error: No key found" + assert report.tasks[0].errors[1].message == "The row at position 3 has an error: No default found for BB" diff --git a/tests/plugins/test_site_geometry.py b/tests/plugins/test_site_geometry.py index 6eb261f4..b95962e6 100644 --- a/tests/plugins/test_site_geometry.py +++ b/tests/plugins/test_site_geometry.py @@ -6,6 +6,7 @@ import attrs # Local +from abis_mapping import models from abis_mapping import plugins # Typing @@ -128,3 +129,133 @@ def test_check_site_geometry_valid(self, source: dict[str, Any], site_ids: set[s # Assert assert report.valid == valid + + +def test_site_geometry_with_site_identifiers() -> None: + """Tests the site geometry checker with site identifiers.""" + # Construct fake resource + resource = frictionless.Resource( + source=[ + # valid with location + { + "decimalLatitude": "40", + "decimalLongitude": "40", + "footprintWKT": None, + "geodeticDatum": "GDA2020", + "siteID": None, + "siteIDSource": None, + "existingBDRSiteIRI": None, + }, + { + "decimalLatitude": None, + "decimalLongitude": None, + "footprintWKT": "POINT (20, 20)", + "geodeticDatum": "GDA2020", + "siteID": None, + "siteIDSource": None, + "existingBDRSiteIRI": None, + }, + # valid with a site in occurrence file + { + "decimalLatitude": None, + "decimalLongitude": None, + "footprintWKT": None, + "geodeticDatum": None, + "siteID": "S1", + "siteIDSource": "ORG", + "existingBDRSiteIRI": None, + }, + { + "decimalLatitude": None, + "decimalLongitude": None, + "footprintWKT": None, + "geodeticDatum": None, + "siteID": None, + "siteIDSource": None, + "existingBDRSiteIRI": "SITE-IRI", + }, + ], + ) + + # Validate + report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.sites_geometry.SitesGeometry( + occurrence_site_identifiers={ + models.identifier.SiteIdentifier( + site_id="S1", 
site_id_source="ORG", existing_bdr_site_iri=None + ), + models.identifier.SiteIdentifier( + site_id=None, site_id_source=None, existing_bdr_site_iri="SITE-IRI" + ), + } + ) + ] + ) + ) + + # Assert + assert report.valid + + +def test_site_geometry_with_site_identifiers_invalid_data() -> None: + """Tests the site geometry checker with site identifiers and invalid data.""" + # Construct fake resource + resource = frictionless.Resource( + source=[ + # invalid, no complete location or site + { + "decimalLatitude": "30", + "decimalLongitude": None, + "footprintWKT": None, + "geodeticDatum": None, + "siteID": None, + "siteIDSource": None, + "existingBDRSiteIRI": None, + }, + # invalid, no location and site not in occurrence file + { + "decimalLatitude": None, + "decimalLongitude": None, + "footprintWKT": None, + "geodeticDatum": None, + "siteID": "S2", + "siteIDSource": "ORG", + "existingBDRSiteIRI": None, + }, + { + "decimalLatitude": None, + "decimalLongitude": None, + "footprintWKT": None, + "geodeticDatum": None, + "siteID": None, + "siteIDSource": None, + "existingBDRSiteIRI": "SITE-IRI-2", + }, + ], + ) + + # Validate + report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.sites_geometry.SitesGeometry( + occurrence_site_identifiers={ + models.identifier.SiteIdentifier( + site_id="S1", site_id_source="ORG", existing_bdr_site_iri=None + ), + models.identifier.SiteIdentifier( + site_id=None, site_id_source=None, existing_bdr_site_iri="SITE-IRI" + ), + } + ) + ] + ) + ) + + # Assert + assert not report.valid + assert len(report.tasks) == 1 + assert len(report.tasks[0].errors) == 3 + assert [error.type for error in report.tasks[0].errors] == ["row-constraint", "row-constraint", "row-constraint"] diff --git a/tests/plugins/test_site_id_or_iri_validation.py b/tests/plugins/test_site_id_or_iri_validation.py new file mode 100644 index 00000000..088f6910 --- /dev/null +++ b/tests/plugins/test_site_id_or_iri_validation.py @@ -0,0 +1,191 @@ 
+"""Provides Unit Tests for the `abis_mapping.plugins.site_id_or_iri_validation` module""" + +# Third-Party +import frictionless + +# Local +from abis_mapping import plugins + + +def test_with_valid_data() -> None: + """Tests the SiteIdentifierCheck Checker with valid data""" + # Construct Fake Resource + resource = frictionless.Resource( + source=[ + { + "rowID": "1", + "siteID": "P1", + "siteIDSource": "TERN", + "existingBDRSiteIRI": None, + }, + { + "rowID": "2", + "siteID": None, + "siteIDSource": None, + "existingBDRSiteIRI": "https://linked.data.gov.au/dataset/bdr/site/TERN/P2", + }, + { + "rowID": "3", + "siteID": "P3", + "siteIDSource": "TERN", + "existingBDRSiteIRI": "https://linked.data.gov.au/dataset/bdr/site/TERN/P3", + }, + ], + ) + + # Validate + report: frictionless.Report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.site_id_or_iri_validation.SiteIdentifierCheck(), + ], + ), + ) + + # Check + assert report.valid + + +def test_with_invalid_data() -> None: + """Tests the SiteIdentifierCheck Checker with invalid data""" + resource = frictionless.Resource( + source=[ + { + "rowID": "1", + "siteID": None, + "siteIDSource": None, + "existingBDRSiteIRI": None, + }, + { + "rowID": "2", + "siteID": "P1", + "siteIDSource": None, + "existingBDRSiteIRI": None, + }, + { + "rowID": "3", + "siteID": None, + "siteIDSource": "TERN", + "existingBDRSiteIRI": None, + }, + ], + ) + + # Validate + report: frictionless.Report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.site_id_or_iri_validation.SiteIdentifierCheck(), + ], + ), + ) + + # Check + assert not report.valid + assert len(report.tasks) == 1 + assert len(report.tasks[0].errors) == 3 + assert report.tasks[0].errors[0].message == ( + "The row at position 2 has an error: Either siteID and siteIDSource, or existingBDRSiteIRI must be provided." 
+ ) + assert report.tasks[0].errors[1].message == ( + "The row at position 3 has an error: Either siteID and siteIDSource, or existingBDRSiteIRI must be provided." + ) + assert report.tasks[0].errors[2].message == ( + "The row at position 4 has an error: Either siteID and siteIDSource, or existingBDRSiteIRI must be provided." + ) + + +def test_with_valid_data_with_skip_field() -> None: + """Tests the SiteIdentifierCheck Checker with valid data and a skip_when_missing field.""" + # Construct Fake Resource + resource = frictionless.Resource( + source=[ + { + "rowID": "1", + "some_field": "...", + "siteID": "P1", + "siteIDSource": "TERN", + "existingBDRSiteIRI": None, + }, + { + "rowID": "2", + "some_field": "...", + "siteID": None, + "siteIDSource": None, + "existingBDRSiteIRI": "https://linked.data.gov.au/dataset/bdr/site/TERN/P2", + }, + { + "rowID": "3", + "some_field": "...", + "siteID": "P3", + "siteIDSource": "TERN", + "existingBDRSiteIRI": "https://linked.data.gov.au/dataset/bdr/site/TERN/P3", + }, + # valid, but is not checked anyway because some_field is null + { + "rowID": "4", + "some_field": None, + "siteID": "P3", + "siteIDSource": "TERN", + "existingBDRSiteIRI": "https://linked.data.gov.au/dataset/bdr/site/TERN/P3", + }, + # invalid, but not checked because some_field is null + { + "rowID": "5", + "some_field": None, + "siteID": None, + "siteIDSource": None, + "existingBDRSiteIRI": None, + }, + ], + ) + + # Validate + report: frictionless.Report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.site_id_or_iri_validation.SiteIdentifierCheck( + skip_when_missing="some_field", + ), + ], + ), + ) + + # Check + assert report.valid + + +def test_with_invalid_data_with_skip_field() -> None: + """Tests the SiteIdentifierCheck Checker with invalid data and a skip_when_missing field""" + resource = frictionless.Resource( + source=[ + # invalid, and is checked because some_field has a value + { + "rowID": "1", + "some_field": "...", + 
"siteID": None, + "siteIDSource": None, + "existingBDRSiteIRI": None, + }, + ], + ) + + # Validate + report: frictionless.Report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.site_id_or_iri_validation.SiteIdentifierCheck( + skip_when_missing="some_field", + ), + ], + ), + ) + + # Check + assert not report.valid + assert len(report.tasks) == 1 + assert len(report.tasks[0].errors) == 1 + assert report.tasks[0].errors[0].message == ( + "The row at position 2 has an error: Either siteID and siteIDSource, " + "or existingBDRSiteIRI must be provided, when some_field is provided." + ) diff --git a/tests/plugins/test_site_identifier_match.py b/tests/plugins/test_site_identifier_match.py new file mode 100644 index 00000000..ab496f66 --- /dev/null +++ b/tests/plugins/test_site_identifier_match.py @@ -0,0 +1,108 @@ +"""Provides Unit Tests for the `abis_mapping.plugins.site_identifier_match` module""" + +# Third-Party +import frictionless + +# Local +from abis_mapping import models +from abis_mapping import plugins + + +def test_site_identifier_match_with_valid_data() -> None: + """Tests the SiteIdentifierMatches Checker with valid data""" + # Construct Fake Resource + resource = frictionless.Resource( + source=[ + # No siteVisitID, check is skipped + {"rowID": "1", "siteVisitID": None, "siteID": None, "siteIDSource": None, "existingBDRSiteIRI": None}, + # No SiteIdentifier, check is skipped + {"rowID": "2", "siteVisitID": "V1", "siteID": None, "siteIDSource": None, "existingBDRSiteIRI": None}, + # siteVisitID is None in map, no error in this check/template. 
+ {"rowID": "3", "siteVisitID": "V2", "siteID": "S1", "siteIDSource": "TEST", "existingBDRSiteIRI": None}, + # valid data with matching SiteIdentifier + {"rowID": "4", "siteVisitID": "V3", "siteID": "S1", "siteIDSource": "TEST", "existingBDRSiteIRI": None}, + {"rowID": "5", "siteVisitID": "V4", "siteID": None, "siteIDSource": None, "existingBDRSiteIRI": "TEST-IRI"}, + # siteID fields ignored when existingBDRSiteIRI is present. + {"rowID": "6", "siteVisitID": "V4", "siteID": "AA", "siteIDSource": "BB", "existingBDRSiteIRI": "TEST-IRI"}, + ], + ) + # Fake map from site visit data template. + site_visit_id_site_id_map = { + "V2": None, + "V3": models.identifier.SiteIdentifier(site_id="S1", site_id_source="TEST", existing_bdr_site_iri=None), + "V4": models.identifier.SiteIdentifier(site_id=None, site_id_source=None, existing_bdr_site_iri="TEST-IRI"), + } + + # Validate + report: frictionless.Report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.site_identifier_match.SiteIdentifierMatches( + site_visit_id_site_id_map=site_visit_id_site_id_map, + ), + ], + ), + ) + + # Check + assert report.valid + + +def test_site_identifier_match_with_invalid_data() -> None: + """Tests the SiteIdentifierMatches Checker with invalid data""" + # Construct Fake Resource + resource = frictionless.Resource( + source=[ + # siteVisitID not in map, that's an error + { + "rowID": "7", + "siteVisitID": "UNKNOWN", + "siteID": "S1", + "siteIDSource": "TEST", + "existingBDRSiteIRI": None, + }, + # Not matching SiteIdentifiers, that's an error + {"rowID": "8", "siteVisitID": "V5", "siteID": "S1", "siteIDSource": "TEST", "existingBDRSiteIRI": None}, + { + "rowID": "9", + "siteVisitID": "V6", + "siteID": None, + "siteIDSource": None, + "existingBDRSiteIRI": "TEST-IRI-2", + }, + ], + ) + # Fake map from site visit data template. 
+ site_visit_id_site_id_map = { + "V5": models.identifier.SiteIdentifier(site_id="S2", site_id_source="TEST", existing_bdr_site_iri=None), + "V6": models.identifier.SiteIdentifier(site_id=None, site_id_source=None, existing_bdr_site_iri="TEST-IRI"), + } + + # Validate + report: frictionless.Report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.site_identifier_match.SiteIdentifierMatches( + site_visit_id_site_id_map=site_visit_id_site_id_map, + ), + ], + ), + ) + + # Check + assert not report.valid + assert len(report.tasks) == 1 + assert len(report.tasks[0].errors) == 3 + assert report.tasks[0].errors[0].message == ( + 'The cell "UNKNOWN" in row at position "2" and field "siteVisitID" ' + 'at position "2" does not conform to a constraint: ' + "siteVisitID must match a siteVisitID in the survey_site_visit_data template" + ) + assert report.tasks[0].errors[1].message == ( + "The row at position 3 has an error: siteID and siteIDSource must match their " + 'values in the survey_site_visit_data template at the row with siteVisitID "V5".' + ) + assert report.tasks[0].errors[2].message == ( + "The row at position 4 has an error: existingBDRSiteIRI must match their " + 'values in the survey_site_visit_data template at the row with siteVisitID "V6".' 
+ ) diff --git a/tests/plugins/test_unique_together.py b/tests/plugins/test_unique_together.py new file mode 100644 index 00000000..945fd8e3 --- /dev/null +++ b/tests/plugins/test_unique_together.py @@ -0,0 +1,188 @@ +"""Provides Unit Tests for the `abis_mapping.plugins.unique_together` module""" + +# Third-Party +import frictionless + +# Local +from abis_mapping import plugins + + +def test_unique_together_valid_nulls_skipped() -> None: + """Tests the UniqueTogether Checker with valid data""" + # Construct Fake Resource + resource = frictionless.Resource( + source=[ + # unique rows + {"rowID": "1", "ID": "A1", "source": "Z1"}, + {"rowID": "2", "ID": "A1", "source": "Z2"}, + {"rowID": "3", "ID": "A2", "source": "Z1"}, + {"rowID": "4", "ID": "A2", "source": "Z2"}, + # rows with None are skipped, can be duplicates + {"rowID": "5", "ID": "A1", "source": None}, + {"rowID": "6", "ID": "A1", "source": None}, + {"rowID": "7", "ID": None, "source": "Z2"}, + {"rowID": "8", "ID": None, "source": "Z2"}, + {"rowID": "9", "ID": None, "source": None}, + {"rowID": "0", "ID": None, "source": None}, + ], + ) + + # Validate + report: frictionless.Report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.unique_together.UniqueTogether( + fields=["ID", "source"], + null_handling="skip", + ), + ], + ), + ) + + # Check + assert report.valid + + +def test_unique_together_valid_nulls_included() -> None: + """Tests the UniqueTogether Checker with valid data""" + # Construct Fake Resource + resource = frictionless.Resource( + source=[ + # unique rows + {"rowID": "1", "ID": "A1", "source": "Z1"}, + {"rowID": "2", "ID": "A1", "source": "Z2"}, + {"rowID": "3", "ID": "A2", "source": "Z1"}, + {"rowID": "4", "ID": "A2", "source": "Z2"}, + # rows with None are checked, can't be duplicates + {"rowID": "5", "ID": "A1", "source": None}, + {"rowID": "6", "ID": None, "source": "Z2"}, + {"rowID": "7", "ID": None, "source": None}, + ], + ) + + # Validate + report: 
frictionless.Report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.unique_together.UniqueTogether( + fields=["ID", "source"], + null_handling="include", + ), + ], + ), + ) + + # Check + assert report.valid + + +def test_unique_together_invalid() -> None: + """Tests the UniqueTogether Checker with invalid data""" + # Construct Fake Resource + resource = frictionless.Resource( + source=[ + {"rowID": "1", "ID": "A1", "source": "Z1"}, + {"rowID": "2", "ID": "A1", "source": "Z1"}, # invalid, copies row 1 + {"rowID": "3", "ID": "A1", "source": "Z2"}, + {"rowID": "4", "ID": "A1", "source": "Z2"}, # invalid, copies row 3 + {"rowID": "5", "ID": "A1", "source": "Z3"}, + {"rowID": "6", "ID": "A1", "source": "Z1"}, # invalid, copies row 1 + ], + ) + + # Validate + report: frictionless.Report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.unique_together.UniqueTogether( + fields=["ID", "source"], + null_handling="skip", + ), + ], + ), + ) + + # Check + assert not report.valid + assert len(report.tasks) == 1 + assert len(report.tasks[0].errors) == 3 + assert report.tasks[0].errors[0].message == ( + 'Row at position "3" violates the unique together constraint: ' + "The unique together fields [ID, source] contain the values [A1, Z1] " + 'that have already been used in the row at position "2"' + ) + assert report.tasks[0].errors[1].message == ( + 'Row at position "5" violates the unique together constraint: ' + "The unique together fields [ID, source] contain the values [A1, Z2] " + 'that have already been used in the row at position "4"' + ) + assert report.tasks[0].errors[2].message == ( + 'Row at position "7" violates the unique together constraint: ' + "The unique together fields [ID, source] contain the values [A1, Z1] " + 'that have already been used in the row at position "2"' + ) + + +def test_unique_together_invalid_nulls_included() -> None: + """Tests the UniqueTogether Checker with invalid data including 
nulls""" + # Construct Fake Resource + resource = frictionless.Resource( + source=[ + {"rowID": "1", "ID": "A1", "source": None}, + {"rowID": "2", "ID": "A1", "source": None}, # invalid, copies row 1 + ], + ) + + # Validate + report: frictionless.Report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.unique_together.UniqueTogether( + fields=["ID", "source"], + null_handling="include", + ), + ], + ), + ) + + # Check + assert not report.valid + assert len(report.tasks) == 1 + assert len(report.tasks[0].errors) == 1 + assert report.tasks[0].errors[0].message == ( + 'Row at position "3" violates the unique together constraint: ' + "The unique together fields [ID, source] contain the values [A1, None] " + 'that have already been used in the row at position "2"' + ) + + +def test_unique_together_invalid_custom_template() -> None: + """Tests the UniqueTogether Checker with invalid data and a custom error""" + # Construct Fake Resource + resource = frictionless.Resource( + source=[ + {"rowID": "1", "ID": "A1", "source": "Z1"}, + {"rowID": "2", "ID": "A1", "source": "Z1"}, # invalid, copies row 1 + ], + ) + + # Validate + report: frictionless.Report = resource.validate( + checklist=frictionless.Checklist( + checks=[ + plugins.unique_together.UniqueTogether( + fields=["ID", "source"], + null_handling="skip", + error_message_template="FIELDS: {fields} VALUES: {values} ROW: {first_seen_row_number}", + ), + ], + ), + ) + + # Check + assert not report.valid + assert len(report.tasks) == 1 + assert len(report.tasks[0].errors) == 1 + assert report.tasks[0].errors[0].message == ( + 'Row at position "3" violates the unique together constraint: FIELDS: ID, source VALUES: A1, Z1 ROW: 2' + ) diff --git a/tests/templates/conftest.py b/tests/templates/conftest.py index 1284e90c..a56a4711 100644 --- a/tests/templates/conftest.py +++ b/tests/templates/conftest.py @@ -327,7 +327,7 @@ class TemplateTestParameters: ), expected=None, should_validate=False, - 
expected_error_codes={"unique-error", "primary-key"}, + expected_error_codes={"unique-together"}, ), ], metadata_sampling_type="systematic survey", diff --git a/tests/templates/test_survey_occurrence_data_v3.py b/tests/templates/test_survey_occurrence_data_v3.py index 4ac4d0dc..13a47a45 100644 --- a/tests/templates/test_survey_occurrence_data_v3.py +++ b/tests/templates/test_survey_occurrence_data_v3.py @@ -86,114 +86,132 @@ class Scenario: name: str raws: list[list[str]] - expected_error_codes: set[str] = set() - default_map: dict[str, str] + expected_error_codes: list[str] | None + default_map: dict[models.identifier.SiteIdentifier, str] # List of scenarios for the apply_validation method tests scenarios: list[Scenario] = [ Scenario( name="valid_with_default_map", raws=[ - ["site1", "-38.94", "115.21", "WGS84", "", "", "", ""], - ["site1", "", "", "", "", "", "", ""], - ["site2", "-38.94", "115.21", "WGS84", "", "", "", ""], - ["site3", "-38.94", "115.21", "AGD66", "", "", "", ""], - ["site4", "-38.94", "115.21", "EPSG:4202", "", "", "", ""], + ["R1", "site1", "ORG", "", "-38.94", "115.21", "WGS84"], + ["R2", "site1", "ORG", "", "", "", ""], + ["R3", "site2", "ORG", "", "-38.94", "115.21", "WGS84"], + ["R4", "site3", "ORG", "", "-38.94", "115.21", "AGD66"], + ["R5", "site4", "ORG", "", "-38.94", "115.21", "EPSG:4202"], ], - default_map={"site1": "something"}, + default_map={ + ( + models.identifier.SiteIdentifier(site_id="site1", site_id_source="ORG", existing_bdr_site_iri=None) + ): "something", + }, + expected_error_codes=None, ), Scenario( name="invalid_missing_from_default_map", raws=[ - ["site1", "-38.94", "115.21", "WGS84", "", "", "", ""], - ["site1", "", "", "", "", "", "", ""], - ["site2", "-38.94", "115.21", "WGS84", "", "", "", ""], + ["R1", "site1", "ORG", "", "-38.94", "115.21", "WGS84"], + ["R2", "site1", "ORG", "", "", "", ""], + ["R3", "site2", "ORG", "", "-38.94", "115.21", "WGS84"], ], - default_map={"site3": "something"}, - 
expected_error_codes={"row-constraint"}, + default_map={}, + expected_error_codes=["row-constraint"], ), Scenario( name="invalid_survey_occurrence_requires_latlong", raws=[ - ["site1", "-38.94", "115.21", "WGS84", "", "", "", ""], - ["", "", "", "", "", "", "VU", "VIC"], - ["site2", "-38.94", "115.21", "WGS84", "", "", "", ""], + ["R1", "site1", "ORG", "", "-38.94", "115.21", "WGS84"], + ["R2", "", "", "", "", "", ""], + ["R3", "site2", "ORG", "", "-38.94", "115.21", "WGS84"], ], default_map={}, - expected_error_codes={"row-constraint"}, + expected_error_codes=["row-constraint"], ), Scenario( name="valid_survey_occurrence_requires_latlong", raws=[ - ["site1", "-38.94", "115.21", "WGS84", "", "", "", ""], - ["", "-38.94", "115.21", "WGS84", "", "", "VU", "VIC"], - ["site2", "-38.94", "115.21", "WGS84", "", "", "", ""], + ["R1", "site1", "ORG", "", "-38.94", "115.21", "WGS84"], + ["R2", "", "", "", "-38.94", "115.21", "WGS84"], + ["R3", "site2", "ORG", "", "-38.94", "115.21", "WGS84"], # The following show that non-url safe characters get encoded during mapping. 
- ["site a", "-38.94", "115.21", "WGS84", "", "", "", ""], - ["site/b", "-38.94", "115.21", "WGS84", "", "", "", ""], - ["site%20c", "-38.94", "115.21", "WGS84", "", "", "", ""], + ["R4", "site a", "ORG", "", "-38.94", "115.21", "WGS84"], + ["R5", "site/b", "ORG", "", "-38.94", "115.21", "WGS84"], + ["R6", "site%20c", "ORG", "", "-38.94", "115.21", "WGS84"], ], default_map={}, + expected_error_codes=None, ), Scenario( name="invalid_missing_long", raws=[ - ["site1", "-38.94", "115.21", "WGS84", "", "", "", ""], - ["site1", "-38.94", "", "WGS84", "", "", "", ""], - ["site2", "-38.94", "115.21", "WGS84", "", "", "", ""], + ["R1", "site1", "ORG", "", "-38.94", "115.21", "WGS84"], + ["R2", "site1", "ORG", "", "-38.94", "", "WGS84"], + ["R3", "site2", "ORG", "", "-38.94", "115.21", "WGS84"], ], - default_map={"site1": "something"}, - expected_error_codes={"row-constraint"}, + default_map={ + ( + models.identifier.SiteIdentifier(site_id="site1", site_id_source="ORG", existing_bdr_site_iri=None) + ): "something", + }, + expected_error_codes=["row-constraint"], ), Scenario( name="invalid_missing_lat", raws=[ - ["site1", "-38.94", "115.21", "WGS84", "", "", "", ""], - ["site1", "", "115.21", "WGS84", "", "", "", ""], - ["site2", "-38.94", "115.21", "WGS84", "", "", "", ""], + ["R1", "site1", "ORG", "", "-38.94", "115.21", "WGS84"], + ["R2", "site1", "ORG", "", "", "115.21", "WGS84"], + ["R3", "site2", "ORG", "", "-38.94", "115.21", "WGS84"], ], - default_map={"site1": "something"}, - expected_error_codes={"row-constraint"}, + default_map={ + ( + models.identifier.SiteIdentifier(site_id="site1", site_id_source="ORG", existing_bdr_site_iri=None) + ): "something", + }, + expected_error_codes=["row-constraint"], ), Scenario( name="invalid_survey_occurrence_missing_lat", raws=[ - ["site1", "-38.94", "115.21", "WGS84", "", "", "", ""], - ["", "", "115.21", "WGS84", "", "", "", ""], - ["site2", "-38.94", "115.21", "WGS84", "", "", "", ""], + ["R1", "site1", "ORG", "", "-38.94", 
"115.21", "WGS84"], + ["R2", "", "", "", "", "115.21", "WGS84"], + ["R3", "site2", "ORG", "", "-38.94", "115.21", "WGS84"], ], default_map={}, - expected_error_codes={"row-constraint"}, + expected_error_codes=["row-constraint", "row-constraint"], ), Scenario( name="invalid_survey_occurrence_missing_long", raws=[ - ["site1", "-38.94", "115.21", "WGS84", "", "", "", ""], - ["", "-38.94", "", "WGS84", "", "", "", ""], - ["site2", "-38.94", "115.21", "WGS84", "", "", "", ""], + ["R1", "site1", "ORG", "", "-38.94", "115.21", "WGS84"], + ["R2", "", "", "", "-38.94", "", "WGS84"], + ["R3", "site2", "ORG", "", "-38.94", "115.21", "WGS84"], ], default_map={}, - expected_error_codes={"row-constraint"}, + expected_error_codes=["row-constraint"], ), Scenario( name="invalid_missing_geodetic_datum", raws=[ - ["site1", "-38.94", "115.21", "WGS84", "", "", "", ""], - ["site1", "-38.94", "115.21", "", "", "", "", ""], - ["site2", "-38.94", "115.21", "WGS84", "", "", "", ""], + ["R1", "site1", "ORG", "", "-38.94", "115.21", "WGS84"], + ["R2", "site1", "ORG", "", "-38.94", "115.21", ""], + ["R3", "site2", "ORG", "", "-38.94", "115.21", "WGS84"], ], - default_map={"site1": "something"}, - expected_error_codes={"row-constraint"}, + default_map={ + ( + models.identifier.SiteIdentifier(site_id="site1", site_id_source="ORG", existing_bdr_site_iri=None) + ): "something", + }, + expected_error_codes=["row-constraint"], ), Scenario( name="invalid_survey_occurrence_missing_geodetic_datum", raws=[ - ["site1", "-38.94", "115.21", "WGS84", "", "", "", ""], - ["", "-38.94", "115.21", "", "", "", "", ""], - ["site2", "-38.94", "115.21", "WGS84", "", "", "", ""], + ["R1", "site1", "ORG", "", "-38.94", "115.21", "WGS84"], + ["R2", "", "", "", "-38.94", "115.21", ""], + ["R3", "site2", "ORG", "", "-38.94", "115.21", "WGS84"], ], default_map={}, - expected_error_codes={"row-constraint"}, + expected_error_codes=["row-constraint"], ), ] @@ -212,14 +230,13 @@ def test_apply_validation(self, scenario: 
Scenario, mocker: pytest_mock.MockerFi """ # Construct fake data rawh = [ + "providerRecordID", "siteID", + "siteIDSource", + "existingBDRSiteIRI", "decimalLatitude", "decimalLongitude", "geodeticDatum", - "organismQuantity", - "organismQuantityType", - "threatStatus", - "conservationAuthority", ] all_raw = [{hname: val for hname, val in zip(rawh, ln, strict=True)} for ln in scenario.raws] @@ -247,10 +264,13 @@ def test_apply_validation(self, scenario: Scenario, mocker: pytest_mock.MockerFi ) # Assert - assert report.valid == (scenario.expected_error_codes == set()) - if not report.valid: - error_codes = [code for codes in report.flatten(["type"]) for code in codes] - assert set(error_codes) == scenario.expected_error_codes + if scenario.expected_error_codes is None: + assert report.valid + else: + assert not report.valid + assert len(report.tasks) == 1 + error_codes = [error.type for error in report.tasks[0].errors] + assert error_codes == scenario.expected_error_codes def test_apply_mapping(self, mapper: Mapper) -> None: """Tests apply_mapping method with default geometry map. 
@@ -335,20 +355,20 @@ class Scenario: Scenario( name="valid_with_default_map", raws=[ - ["SV1", "S1", "2024-10-16"], - ["SV2", "S1", ""], - ["SV3", "S1", "2024-10-16T15:15:15+0800"], - ["SV4", "S1", ""], + ["SV1", "S1", "ORG", "2024-10-16"], + ["SV2", "S1", "ORG", ""], + ["SV3", "S1", "ORG", "2024-10-16T15:15:15+0800"], + ["SV4", "S1", "ORG", ""], ], default_map={"SV2": "some rdf", "SV4": "some rdf"}, ), Scenario( name="invalid_with_default_map", raws=[ - ["SV1", "S1", "2024-10-16"], - ["SV2", "S1", ""], - ["SV3", "S1", "2024-10-16T15:15:15+0800"], - ["SV4", "S1", ""], + ["SV1", "S1", "ORG", "2024-10-16"], + ["SV2", "S1", "ORG", ""], + ["SV3", "S1", "ORG", "2024-10-16T15:15:15+0800"], + ["SV4", "S1", "ORG", ""], ], default_map={"SV2": "some rdf"}, expected_error_codes={"row-constraint"}, @@ -372,6 +392,7 @@ def test_apply_validation(self, scenario: Scenario, mocker: pytest_mock.MockerFi rawh = [ "siteVisitID", "siteID", + "siteIDSource", "eventDateStart", ] all_raw = [{hname: val for hname, val in zip(rawh, ln, strict=True)} for ln in scenario.raws] @@ -446,32 +467,53 @@ class Scenario: """Dataclass to hold the scenario parameters.""" name: str - raws: list[list[str]] - expected_error_codes: set[str] = set() - lookup_map: dict[str, str] + raws: list[list[str | None]] + expected_error_codes: list[str] | None + lookup_map: dict[str, models.identifier.SiteIdentifier | None] scenarios: list[Scenario] = [ Scenario( name="valid_with_default_map", raws=[ - ["SV1", "S1"], - ["SV2", "S1"], - ["SV3", "S1"], - ["SV4", "S1"], - ["", "S1"], + ["SV1", "S1", "ORG", None], + ["SV2", "S2", "ORG", None], + ["SV3", None, None, "https://linked.data.gov.au/dataset/bdr/site/ORG/S3"], ], - lookup_map={"SV1": "S1", "SV2": "S1", "SV3": "S1", "SV4": "S1"}, + lookup_map={ + "SV1": None, + "SV2": models.identifier.SiteIdentifier( + site_id="S2", + site_id_source="ORG", + existing_bdr_site_iri=None, + ), + "SV3": models.identifier.SiteIdentifier( + site_id=None, + site_id_source=None, + 
existing_bdr_site_iri="https://linked.data.gov.au/dataset/bdr/site/ORG/S3", + ), + }, + expected_error_codes=None, ), Scenario( name="invalid_with_default_map", raws=[ - ["SV1", "S1"], - ["SV2", "S1"], - ["SV3", "S1"], - ["SV4", "S1"], + ["SV1", "S1", "ORG", None], + ["SV2", "S1", "ORG", None], + ["SV3", None, None, "https://linked.data.gov.au/dataset/bdr/site/ORG/S1"], ], - lookup_map={"SV2": "S2"}, - expected_error_codes={"row-constraint"}, + lookup_map={ + "SV2": models.identifier.SiteIdentifier( + site_id="S2", + site_id_source="ORG", + existing_bdr_site_iri=None, + ), + "SV3": models.identifier.SiteIdentifier( + site_id=None, + site_id_source=None, + existing_bdr_site_iri="https://linked.data.gov.au/dataset/bdr/site/ORG/S3", + ), + }, + expected_error_codes=["constraint-error", "row-constraint", "row-constraint"], ), ] @@ -492,6 +534,8 @@ def test_apply_validation(self, scenario: Scenario, mocker: pytest_mock.MockerFi rawh = [ "siteVisitID", "siteID", + "siteIDSource", + "existingBDRSiteIRI", ] all_raw = [{hname: val for hname, val in zip(rawh, ln, strict=True)} for ln in scenario.raws] @@ -517,10 +561,13 @@ def test_apply_validation(self, scenario: Scenario, mocker: pytest_mock.MockerFi ) # Assert - assert report.valid == (scenario.expected_error_codes == set()) - if not report.valid: - error_codes = [code for codes in report.flatten(["type"]) for code in codes] - assert set(error_codes) == scenario.expected_error_codes + if scenario.expected_error_codes is None: + assert report.valid + else: + assert not report.valid + assert len(report.tasks) == 1 + error_codes = [error.type for error in report.tasks[0].errors] + assert error_codes == scenario.expected_error_codes def test_extract_site_id_keys( @@ -533,14 +580,26 @@ def test_extract_site_id_keys( mocker (pytest_mock.MockerFixture): The mocker fixture. """ # Construct a raw data set only using fields relevant to method. 
- rawh = ["siteID"] - raws = [["site1"], [""], ["site2"], ["site3"], ["site3"]] + rawh = ["siteID", "siteIDSource", "existingBDRSiteIRI"] + raws = [ + ["site1", "ORG", ""], + ["", "", ""], + ["site2", "ORG", ""], + ["site2", "ORG", ""], + ["", "", "SITE-IRI"], + ] # Amalgamate into a list of dicts all_raw = [{hname: val for hname, val in zip(rawh, ln, strict=True)} for ln in raws] # Modify schema to only include the necessary fields - descriptor = {"fields": [{"name": "siteID", "type": "string"}]} + descriptor = { + "fields": [ + {"name": "siteID", "type": "string"}, + {"name": "siteIDSource", "type": "string"}, + {"name": "existingBDRSiteIRI", "type": "string"}, + ] + } mocker.patch.object(base.mapper.ABISMapper, "schema").return_value = descriptor # Create raw data csv string @@ -552,9 +611,9 @@ def test_extract_site_id_keys( csv_data = output.getvalue().encode("utf-8") expected = { - "site1": True, - "site2": True, - "site3": True, + models.identifier.SiteIdentifier(site_id="site1", site_id_source="ORG", existing_bdr_site_iri=None): True, + models.identifier.SiteIdentifier(site_id="site2", site_id_source="ORG", existing_bdr_site_iri=None): True, + models.identifier.SiteIdentifier(site_id=None, site_id_source=None, existing_bdr_site_iri="SITE-IRI"): True, } # Invoke method diff --git a/tests/templates/test_survey_site_data_v3.py b/tests/templates/test_survey_site_data_v3.py index 402950d9..b554543f 100644 --- a/tests/templates/test_survey_site_data_v3.py +++ b/tests/templates/test_survey_site_data_v3.py @@ -13,6 +13,7 @@ # Local from abis_mapping import base +from abis_mapping import models import abis_mapping.templates.survey_site_data_v3.mapping # Typing @@ -28,20 +29,31 @@ def test_extract_geometry_defaults( mocker: The mocker fixture. """ # Construct a dummy raw data set using only the fields that matter to the method. 
- rawh = ["siteID", "footprintWKT", "decimalLongitude", "decimalLatitude", "geodeticDatum"] + rawh = [ + "siteID", + "siteIDSource", + "existingBDRSiteIRI", + "footprintWKT", + "decimalLongitude", + "decimalLatitude", + "geodeticDatum", + ] raws = [ - ["site1", "POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))", "", "", "WGS84"], - ["site2", "POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))", "10.0", "20.0", "WGS84"], - ["site3", "", "10.0", "20.0", "WGS84"], - ["site4", "", "", "", ""], - ["site5", "", "10.0", "20.0", ""], - ["site6", "POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))", "", "", ""], - ["site7", "", "10.0", "20.0", "AGD66"], - ["site8", "", "11.0", "21.0", "EPSG:4202"], - ["site9", "", "12.0", "22.0", "GRS20"], - # rows with missing siteID should not be included in map - ["", "POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))", "", "", "WGS84"], - ["", "", "10.0", "20.0", "WGS84"], + ["site1", "ORG", "", "POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))", "", "", "WGS84"], + ["site2", "ORG", "", "POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))", "10.0", "20.0", "WGS84"], + ["site3", "ORG", "", "", "10.0", "20.0", "WGS84"], + ["site4", "ORG", "", "", "10.0", "20.0", "AGD66"], + ["site5", "ORG", "", "", "11.0", "21.0", "EPSG:4202"], + ["", "", "SITE-IRI", "", "15.0", "25.0", "WGS84"], + # rows with no datum will be omitted from map + ["site7", "ORG", "", "", "", "", ""], + ["site8", "ORG", "", "", "10.0", "20.0", ""], + ["site9", "ORG", "", "POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))", "", "", ""], + # Row with invalid datum will be omitted from map + ["site10", "ORG", "", "", "12.0", "22.0", "GRS20"], + # rows with missing site identifier should not be included in map + ["", "", "", "POLYGON((0 0, 0 5, 5 5, 5 0, 0 0))", "", "", "WGS84"], + ["", "", "", "", "10.0", "20.0", "WGS84"], ] # Amalgamate into a list of dicts all_raw = [{hname: val for hname, val in zip(rawh, ln, strict=True)} for ln in raws] @@ -53,6 +65,8 @@ def test_extract_geometry_defaults( descriptor = { "fields": [ {"name": "siteID", "type": "string"}, + {"name": 
"siteIDSource", "type": "string"}, + {"name": "existingBDRSiteIRI", "type": "string"}, {"name": "footprintWKT", "type": "wkt"}, {"name": "decimalLongitude", "type": "number"}, {"name": "decimalLatitude", "type": "number"}, @@ -70,11 +84,24 @@ def test_extract_geometry_defaults( csv_data = output.getvalue().encode("utf-8") expected = { - "site1": " POINT (2.5 2.5)", - "site2": " POINT (2.5 2.5)", - "site3": " POINT (20 10)", - "site7": " POINT (20 10)", - "site8": " POINT (21 11)", + ( + models.identifier.SiteIdentifier(site_id="site1", site_id_source="ORG", existing_bdr_site_iri=None) + ): " POINT (2.5 2.5)", + ( + models.identifier.SiteIdentifier(site_id="site2", site_id_source="ORG", existing_bdr_site_iri=None) + ): " POINT (2.5 2.5)", + ( + models.identifier.SiteIdentifier(site_id="site3", site_id_source="ORG", existing_bdr_site_iri=None) + ): " POINT (20 10)", + ( + models.identifier.SiteIdentifier(site_id="site4", site_id_source="ORG", existing_bdr_site_iri=None) + ): " POINT (20 10)", + ( + models.identifier.SiteIdentifier(site_id="site5", site_id_source="ORG", existing_bdr_site_iri=None) + ): " POINT (21 11)", + ( + models.identifier.SiteIdentifier(site_id=None, site_id_source=None, existing_bdr_site_iri="SITE-IRI") + ): " POINT (25 15)", } # Invoke method assert hasattr(mapper, "extract_geometry_defaults") @@ -91,35 +118,38 @@ class Scenario: name: str raws: list[list[str]] - site_id_map: dict[str, bool] - expected_error_codes: set[str] = set() + site_id_map: dict[models.identifier.SiteIdentifier, bool] + expected_error_codes: list[str] | None scenarios: list[Scenario] = [ Scenario( name="valid_with_site_id_map", raws=[ - ["site1", "-38.94", "115.21", "POINT(30 10)", "WGS84"], - ["site2", "-38.94", "115.21", "", "GDA2020"], - ["site3", "", "", "LINESTRING(30 10, 10 30, 40 40)", "GDA94"], - ["site4", "", "", "", ""], - # Following shows that non-url safe siteIDs get endcoded when mapping. 
- ["site a", "-38.94", "115.21", "", "GDA2020"], - ["site/b", "-38.94", "115.21", "", "GDA2020"], - [r"site\c", "-38.94", "115.21", "", "GDA2020"], - ["site\nd", "-38.94", "115.21", "", "GDA2020"], + # has geometry + ["site1", "ORG", "", "-38.94", "115.21", "POINT(30 10)", "WGS84"], + ["site2", "ORG", "", "-38.94", "115.21", "", "GDA2020"], + ["site3", "ORG", "", "", "", "LINESTRING(30 10, 10 30, 40 40)", "GDA94"], + # missing geometry, but has a Site in the map. + ["site4", "ORG", "", "", "", "", ""], + ["", "", "https://example.com/EXISTING-SITE-IRI", "", "", "", ""], ], site_id_map={ - "site4": True, - "siteNone": True, + models.identifier.SiteIdentifier( + site_id="site4", site_id_source="ORG", existing_bdr_site_iri=None + ): True, + models.identifier.SiteIdentifier( + site_id=None, site_id_source=None, existing_bdr_site_iri="https://example.com/EXISTING-SITE-IRI" + ): True, }, + expected_error_codes=None, ), Scenario( name="invalid_missing_geometry_and_not_in_map", raws=[ - ["site1", "", "", "", ""], + ["site1", "ORG", "", "", "", "", ""], ], - site_id_map={"site2": True}, - expected_error_codes={"row-constraint"}, + site_id_map={}, + expected_error_codes=["row-constraint"], ), ] @@ -136,7 +166,15 @@ def test_apply_validation(self, scenario: Scenario, mocker: pytest_mock.MockerFi mocker (pytest_mock.MockerFixture): The mocker fixture. 
""" # Construct fake data - rawh = ["siteID", "decimalLatitude", "decimalLongitude", "footprintWKT", "geodeticDatum"] + rawh = [ + "siteID", + "siteIDSource", + "existingBDRSiteIRI", + "decimalLatitude", + "decimalLongitude", + "footprintWKT", + "geodeticDatum", + ] all_raw = [{hname: val for hname, val in zip(rawh, ln, strict=True)} for ln in scenario.raws] # Get mapper @@ -164,10 +202,13 @@ def test_apply_validation(self, scenario: Scenario, mocker: pytest_mock.MockerFi ) # Assert - assert report.valid == (scenario.expected_error_codes == set()) - if not report.valid: - error_codes = [code for codes in report.flatten(["type"]) for code in codes] - assert set(error_codes) == scenario.expected_error_codes + if scenario.expected_error_codes is None: + assert report.valid + else: + assert not report.valid + assert len(report.tasks) == 1 + error_codes = [error.type for error in report.tasks[0].errors] + assert error_codes == scenario.expected_error_codes @pytest.mark.parametrize( diff --git a/tests/templates/test_survey_site_visit_data_v3.py b/tests/templates/test_survey_site_visit_data_v3.py index d7003d59..92dfc3d6 100644 --- a/tests/templates/test_survey_site_visit_data_v3.py +++ b/tests/templates/test_survey_site_visit_data_v3.py @@ -119,6 +119,7 @@ def test_extract_temporal_defaults( descriptor = { **original_descriptor, "fields": [f for f in original_descriptor["fields"] if f["name"] in fieldnames], + "foreignKeys": [], # remove FKs that reference fields not included } # Patch schema @@ -205,7 +206,7 @@ def test_extract_site_visit_id_to_site_id_map( original_descriptor = mapping.SurveySiteVisitMapper.schema() # Define fields of relevance for tests - fieldnames = ["surveyID", "siteID", "siteVisitID"] + fieldnames = ["surveyID", "siteID", "siteIDSource", "existingBDRSiteIRI", "siteVisitID"] # Make descriptor only include these fields descriptor = { @@ -217,29 +218,34 @@ def test_extract_site_visit_id_to_site_id_map( mocked_schema = 
mocker.patch.object(mapping.SurveySiteVisitMapper, "schema", return_value=descriptor) # Declare some raw data - expected_rows: list[dict[str, str | None]] = [ + rows: list[dict[str, str | None]] = [ + # rows included in the map: { "surveyID": "A", "siteID": "S1", + "siteIDSource": "ORG", + "existingBDRSiteIRI": None, "siteVisitID": "SV1", }, { "surveyID": "A", - "siteID": "S1", + "siteID": None, + "siteIDSource": None, + "existingBDRSiteIRI": "https://linked.data.gov.au/dataset/bdr/site/ORG/S2", "siteVisitID": "SV2", }, - ] - excluded_rows: list[dict[str, str | None]] = [ - # The map should exclude these since there are no - # values for siteID { "surveyID": "A", - "siteID": "", + "siteID": "S3", + "siteIDSource": "ORG", + "existingBDRSiteIRI": "https://linked.data.gov.au/dataset/bdr/site/ORG/S3", "siteVisitID": "SV3", }, { "surveyID": "A", "siteID": None, + "siteIDSource": None, + "existingBDRSiteIRI": None, "siteVisitID": "SV4", }, # map should exclude these because there is no siteVisitID @@ -255,13 +261,30 @@ def test_extract_site_visit_id_to_site_id_map( }, ] # Construct expected map - expected = {r["siteVisitID"]: r["siteID"] for r in expected_rows} + expected: dict[str, models.identifier.SiteIdentifier | None] = { + "SV1": models.identifier.SiteIdentifier( + site_id="S1", + site_id_source="ORG", + existing_bdr_site_iri=None, + ), + "SV2": models.identifier.SiteIdentifier( + site_id=None, + site_id_source=None, + existing_bdr_site_iri="https://linked.data.gov.au/dataset/bdr/site/ORG/S2", + ), + "SV3": models.identifier.SiteIdentifier( + site_id=None, + site_id_source=None, + existing_bdr_site_iri="https://linked.data.gov.au/dataset/bdr/site/ORG/S3", + ), + "SV4": None, + } # Create raw data csv string output = io.StringIO() - csv_writer = csv.DictWriter(output, fieldnames=expected_rows[0].keys()) + csv_writer = csv.DictWriter(output, fieldnames=fieldnames) csv_writer.writeheader() - for row in expected_rows + excluded_rows: + for row in rows: 
csv_writer.writerow(row) csv_data = output.getvalue().encode("utf-8")