Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More MITAardvark methods #110

Merged
merged 4 commits into from
Jan 5, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"id": "123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_title_s": "Test title 1", "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "POLYGON((-80 25, -65 18, -64 33, -80 25))", "schema_provider_s": "MIT"}
{"id": "mit:123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_identifier_sm": ["abc123"], "dct_issued_s": "2003-10-23", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_references_s": "{\"https://schema.org/downloadUrl\": [{\"label\": \"Source Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml\"}, {\"label\": \"Normalized Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.normalized.aardvark.json\"}, {\"label\": \"Data Zipfile\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.zip\"}]}", "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_temporal_sm": ["1943", "1979"], "dct_title_s": "Test title 1", "gbl_dateRange_drsim": ["[1943 TO 1946]"], "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_indexYear_im": [1943,1944,1945,1946], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "schema_provider_s": "MIT"}
4 changes: 2 additions & 2 deletions tests/fixtures/aardvark_records.jsonl
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
{"id": "123", "dct_title_s": "Test title 1"}
{"id": "456", "dct_title_s": "Test title 2"}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "mit:123", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "ogm:456", "locn_geometry": ""}
73 changes: 73 additions & 0 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pytest

import transmogrifier.models as timdex
from transmogrifier.sources.json.aardvark import MITAardvark

Expand All @@ -21,6 +23,7 @@ def test_aardvark_transform_returns_timdex_record(aardvark_records):
title="Test title 1",
citation="Test title 1. Geospatial data. https://example.com/123",
content_type=["Geospatial data"],
rights=[timdex.Rights(description="Access rights", kind="Access")],
)


Expand Down Expand Up @@ -63,6 +66,76 @@ def test_aardvark_get_contributors_success(aardvark_record_all_fields):
]


def test_aardvark_get_dates_success(aardvark_record_all_fields):
    """Check the dates produced by MITAardvark.get_dates for the all-fields record.

    The expected list holds one "Issued" date, five "Coverage" dates, and a
    single date range.
    """
    record = next(aardvark_record_all_fields)
    coverage_years = ["1943", "1979", "1944", "1945", "1946"]
    expected_dates = [
        timdex.Date(kind="Issued", value="2003-10-23"),
        *[timdex.Date(kind="Coverage", value=year) for year in coverage_years],
        timdex.Date(
            range=timdex.Date_Range(gte="1943", lte="1946"),
        ),
    ]
    assert MITAardvark.get_dates(record, "123") == expected_dates


def test_aardvark_parse_solr_date_range_string_success():
    """Check that "[1932 TO 1937]" is parsed into the pair ("1932", "1937")."""
    parsed_range = MITAardvark.parse_solr_date_range_string("[1932 TO 1937]", "123")
    assert parsed_range == ("1932", "1937")


def test_parse_solr_date_range_invalid_date_range_string_raises_error():
    """Check that an unparsable range string raises ValueError naming the record."""
    expected_message = "Record ID '123': Unable to parse date range string 'Invalid'"
    with pytest.raises(ValueError, match=expected_message):
        MITAardvark.parse_solr_date_range_string("Invalid", "123")


def test_aardvark_get_identifiers_success(aardvark_record_all_fields):
    """Check that get_identifiers returns the single "abc123" identifier."""
    record = next(aardvark_record_all_fields)
    expected_identifiers = [timdex.Identifier(value="abc123")]
    assert MITAardvark.get_identifiers(record) == expected_identifiers


def test_aardvark_get_links_success(aardvark_record_all_fields):
    """Check that get_links returns the three "Download" links, in order."""
    record = next(aardvark_record_all_fields)
    source_metadata_link = timdex.Link(
        url="https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml",
        kind="Download",
        text="Source Metadata",
    )
    normalized_metadata_link = timdex.Link(
        url="https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95."
        "normalized.aardvark.json",
        kind="Download",
        text="Normalized Metadata",
    )
    data_zipfile_link = timdex.Link(
        url="https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.zip",
        kind="Download",
        text="Data Zipfile",
    )
    assert MITAardvark.get_links(record, "123") == [
        source_metadata_link,
        normalized_metadata_link,
        data_zipfile_link,
    ]


def test_aardvark_get_links_logs_warning_for_invalid_json(caplog):
    """Check that a non-JSON references string yields no links and a log message."""
    links = MITAardvark.get_links({"dct_references_s": "Invalid"}, "123")
    expected_log = "Record ID '123': Unable to parse links string 'Invalid' as JSON"
    assert links == []
    assert expected_log in caplog.text


def test_aardvark_get_locations_success(aardvark_record_all_fields):
    """Check that get_locations returns a "Bounding Box" and a "Geometry" location."""
    record = next(aardvark_record_all_fields)
    expected_locations = [
        timdex.Location(kind="Bounding Box", geodata=[-111.1, -104.0, 45.0, 40.9]),
        timdex.Location(kind="Geometry", geodata=[-111.1, -104.0, 45.0, 40.9]),
    ]
    assert MITAardvark.get_locations(record, "123") == expected_locations


def test_aardvark_get_notes_success(aardvark_record_all_fields):
assert MITAardvark.get_notes(next(aardvark_record_all_fields)) == [
timdex.Note(
Expand Down
20 changes: 20 additions & 0 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from datetime import datetime

import pytest

import transmogrifier.models as timdex
from transmogrifier.helpers import (
generate_citation,
parse_date_from_string,
parse_geodata_string,
validate_date,
validate_date_range,
)
Expand Down Expand Up @@ -256,6 +259,23 @@ def test_parse_date_from_string_invalid_date_returns_none():
assert parse_date_from_string("circa 1930s") is None


def test_parse_geodata_string_success():
    """Check that an ENVELOPE string is parsed into its four float values."""
    parsed_points = parse_geodata_string(
        "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "123"
    )
    assert parsed_points == [-111.1, -104.0, 45.0, 40.9]


def test_parse_geodata_string_invalid_geodata_string_raises_error():
    """Check that an unparsable geodata string raises ValueError naming the record."""
    expected_message = "Record ID '123': Unable to parse geodata string 'Invalid'"
    with pytest.raises(ValueError, match=expected_message):
        parse_geodata_string("Invalid", "123")


def test_validate_date_success():
assert validate_date("1930", "1234") is True

Expand Down
10 changes: 10 additions & 0 deletions transmogrifier/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,16 @@
"base-url": "https://libguides.mit.edu/",
"transform-class": "transmogrifier.sources.xml.springshare.SpringshareOaiDc",
},
"gismit": {
"name": "MIT GIS Resources",
"base-url": "https://search.libraries.mit.edu/record/",
"transform-class": "transmogrifier.sources.json.aardvark.MITAardvark",
},
"gisogm": {
"name": "OpenGeoMetadata GIS Resources",
"base-url": "https://search.libraries.mit.edu/record/",
"transform-class": "transmogrifier.sources.json.aardvark.OGMAardvark",
},
Comment on lines +106 to +115
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking good!

"researchdatabases": {
"name": "Research Databases",
"base-url": "https://libguides.mit.edu/",
Expand Down
25 changes: 25 additions & 0 deletions transmogrifier/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,31 @@ def parse_date_from_string(
return None


def parse_geodata_string(geodata_string: str, source_record_id: str) -> list[float]:
    """Get list of values from a formatted geodata string.

    The values are read from the text between the last "(" in the string and
    the first ")" that follows it. That text is split on commas, and each piece
    is stripped of surrounding whitespace and converted to a float.

    Example:
        - "ENVELOPE(-111.1, -104.0, 45.0, 40.9)" -> [-111.1, -104.0, 45.0, 40.9]

    Args:
        geodata_string: Formatted geodata string to parse.
        source_record_id: The ID of the record containing the string to parse.

    Returns:
        The parsed values as floats, in the order they appear in the string.

    Raises:
        ValueError: If any of the comma-separated values cannot be converted
            to a float. The message names the record ID and the full string.
    """
    raw_geodata_points = geodata_string.split("(")[-1].split(")")[0].split(",")
    try:
        return [float(raw_point.strip()) for raw_point in raw_geodata_points]
    except ValueError as exception:
        message = (
            f"Record ID '{source_record_id}': "
            f"Unable to parse geodata string '{geodata_string}'"
        )
        # Chain the float() failure so the offending value stays in the traceback.
        raise ValueError(message) from exception


def validate_date(
date_string: str,
source_record_id: str,
Expand Down
2 changes: 1 addition & 1 deletion transmogrifier/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ class Link:

@define
class Location:
value: Optional[str] = field(validator=optional(instance_of(str)))
value: Optional[str] = field(default=None, validator=optional(instance_of(str)))
kind: Optional[str] = field(default=None, validator=optional(instance_of(str)))
geodata: Optional[list[float]] = field(
default=None, validator=optional(list_of(float))
Expand Down
Loading
Loading