Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Gdt 116 update locations #113

Merged
merged 5 commits into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
706 changes: 172 additions & 534 deletions Pipfile.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def timdex_record_all_fields_and_subfields():
timdex.Location(
value="A point on the globe",
kind="Data was gathered here",
geodata=[-77.025955, 38.942142],
geoshape="BBOX(-77.025955, 38.942142)",
ehanson8 marked this conversation as resolved.
Show resolved Hide resolved
)
],
notes=[timdex.Note(value=["This book is awesome"], kind="opinion")],
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"id": "mit:123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_identifier_sm": ["abc123"], "dct_issued_s": "2003-10-23", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_references_s": "{\"https://schema.org/downloadUrl\": [{\"label\": \"Source Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml\"}, {\"label\": \"Normalized Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.normalized.aardvark.json\"}, {\"label\": \"Data Zipfile\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.zip\"}]}", "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_temporal_sm": ["1943", "1979"], "dct_title_s": "Test title 1", "gbl_dateRange_drsim": ["[1943 TO 1946]"], "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_indexYear_im": [1943,1944,1945,1946], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "schema_provider_s": "MIT"}
{"id": "mit:123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_identifier_sm": ["abc123"], "dct_issued_s": "2003-10-23", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_references_s": "{\"https://schema.org/downloadUrl\": [{\"label\": \"Source Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml\"}, {\"label\": \"Normalized Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.normalized.aardvark.json\"}, {\"label\": \"Data Zipfile\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.zip\"}]}", "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_temporal_sm": ["1943", "1979"], "dct_title_s": "Test title 1", "gbl_dateRange_drsim": ["[1943 TO 1946]"], "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_indexYear_im": [1943,1944,1945,1946], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "gbl_suppressed_b": false, "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "schema_provider_s": "MIT"}
5 changes: 3 additions & 2 deletions tests/fixtures/aardvark_records.jsonl
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "mit:123", "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "id": "ogm:456", "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 1", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "mit:123", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 2", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": false, "id": "ogm:456", "locn_geometry": ""}
{"dct_accessRights_s": "Access rights", "dct_references_s": "", "dct_title_s": "Test title 3", "gbl_mdModified_dt": "", "gbl_mdVersion_s": "", "gbl_resourceClass_sm": "", "gbl_suppressed_b": true, "id": "ogm:789", "locn_geometry": ""}
94 changes: 85 additions & 9 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,33 @@
from pathlib import Path

import pytest

import transmogrifier.models as timdex
from transmogrifier.sources.json.aardvark import MITAardvark


def test_mitaardvark_transform_and_write_output_files_writes_output_files(
tmp_path, aardvark_records
):
output_file = str(tmp_path / "output_file.json")
transformer = MITAardvark("cool-repo", aardvark_records)
assert not Path(tmp_path / "output_file.json").exists()
assert not Path(tmp_path / "output_file.txt").exists()
transformer.transform_and_write_output_files(output_file)
assert Path(tmp_path / "output_file.json").exists()
assert Path(tmp_path / "output_file.txt").exists()


def test_mitaardvark_transform_and_write_output_files_no_txt_file_if_not_needed(
tmp_path, aardvark_record_all_fields
):
output_file = str(tmp_path / "output_file.json")
transformer = MITAardvark("cool-repo", aardvark_record_all_fields)
transformer.transform_and_write_output_files(output_file)
assert len(list(tmp_path.iterdir())) == 1
assert next(tmp_path.iterdir()).name == "output_file.json"


def test_aardvark_get_required_fields_returns_expected_values(aardvark_records):
transformer = MITAardvark("cool-repo", aardvark_records)
assert transformer.get_required_fields(next(aardvark_records)) == {
Expand Down Expand Up @@ -43,6 +67,38 @@ def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
]


def test_aardvark_record_is_deleted_returns_false_if_field_missing(
aardvark_record_all_fields,
):
assert MITAardvark.record_is_deleted(next(aardvark_record_all_fields)) is False


def test_aardvark_record_is_deleted_raises_error_if_value_is_string(
aardvark_record_all_fields,
):
with pytest.raises(
ValueError,
match="Record ID '123': 'gbl_suppressed_b' value is not a boolean",
):
aardvark_record = next(aardvark_record_all_fields)
aardvark_record["gbl_suppressed_b"] = "True"
MITAardvark.record_is_deleted(aardvark_record)


def test_aardvark_record_is_deleted_returns_false_if_value_is_false(
aardvark_record_all_fields,
):
aardvark_record = next(aardvark_record_all_fields)
aardvark_record["gbl_suppressed_b"] = False
assert MITAardvark.record_is_deleted(aardvark_record) is False


def test_aardvark_record_is_deleted_success(aardvark_record_all_fields):
aardvark_record = next(aardvark_record_all_fields)
aardvark_record["gbl_suppressed_b"] = True
assert MITAardvark.record_is_deleted(aardvark_record) is True


def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
assert MITAardvark.get_source_record_id(next(aardvark_record_all_fields)) == "123"

Expand Down Expand Up @@ -130,11 +186,30 @@ def test_aardvark_get_links_logs_warning_for_invalid_json(caplog):
)


def test_aardvark_get_locations_success(caplog, aardvark_record_all_fields):
caplog.set_level("DEBUG")
assert "Geometry field 'dcat_bbox' found, but currently not mapped."
assert "Geometry field 'locn_geometry' found, but currently not mapped."
assert MITAardvark.get_locations(next(aardvark_record_all_fields), "123") == []
def test_aardvark_get_locations_success(aardvark_record_all_fields):
assert MITAardvark.get_locations(next(aardvark_record_all_fields), "123") == [
timdex.Location(
kind="Bounding Box", geoshape="BBOX (-111.1, -104.0, 45.0, 40.9)"
),
timdex.Location(kind="Geometry", geoshape="BBOX (-111.1, -104.0, 45.0, 40.9)"),
]


def test_parse_get_locations_string_invalid_geostring_logs_warning(
aardvark_record_all_fields, caplog
):
aardvark_record = next(aardvark_record_all_fields)
aardvark_record["dcat_bbox"] = "Invalid"
aardvark_record["locn_geometry"] = "Invalid"
assert MITAardvark.get_locations(aardvark_record, "123") == []
assert (
"Record ID '123': Unable to parse geodata string 'Invalid' in 'dcat_bbox'"
in caplog.text
)
assert (
"Record ID '123': Unable to parse geodata string 'Invalid' in 'locn_geometry'"
in caplog.text
)


def test_aardvark_get_notes_success(aardvark_record_all_fields):
Expand Down Expand Up @@ -184,10 +259,11 @@ def test_aardvark_get_rights_success(aardvark_record_all_fields):

def test_aardvark_get_subjects_success(aardvark_record_all_fields):
assert MITAardvark.get_subjects(next(aardvark_record_all_fields)) == [
timdex.Subject(value=["Country"], kind="DCAT Keyword"),
timdex.Subject(value=["Political boundaries"], kind="DCAT Theme"),
timdex.Subject(value=["Geography"], kind="Dublin Core Subject"),
timdex.Subject(value=["Earth"], kind="Dublin Core Subject"),
timdex.Subject(value=["Country"], kind="DCAT; Keyword"),
timdex.Subject(value=["Political boundaries"], kind="DCAT; Theme"),
timdex.Subject(value=["Some city, Some country"], kind="Dublin Core; Spatial"),
timdex.Subject(value=["Geography"], kind="Dublin Core; Subject"),
timdex.Subject(value=["Earth"], kind="Dublin Core; Subject"),
Comment on lines +262 to +266
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm digging it. I've seen semicolons used for dilineating namespaces in freetext values like this. Feels like a decent middle ground.

timdex.Subject(value=["Dataset"], kind="Subject scheme not provided"),
timdex.Subject(value=["Vector data"], kind="Subject scheme not provided"),
]
20 changes: 0 additions & 20 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
from datetime import datetime

import pytest

import transmogrifier.models as timdex
from transmogrifier.helpers import (
generate_citation,
parse_date_from_string,
parse_geodata_string,
validate_date,
validate_date_range,
)
Expand Down Expand Up @@ -259,23 +256,6 @@ def test_parse_date_from_string_invalid_date_returns_none():
assert parse_date_from_string("circa 1930s") is None


def test_parse_geodata_string_success():
assert parse_geodata_string("ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "123") == [
-111.1,
-104.0,
45.0,
40.9,
]


def test_parse_geodata_string_invalid_geodata_string_raises_error():
with pytest.raises(
ValueError,
match="Record ID '123': Unable to parse geodata string 'Invalid'",
):
parse_geodata_string("Invalid", "123")


def test_validate_date_success():
assert validate_date("1930", "1234") is True

Expand Down
10 changes: 5 additions & 5 deletions tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,10 @@ def test_timdex_record_all_fields_and_subfields(timdex_record_all_fields_and_sub
timdex_record_all_fields_and_subfields.locations[0].kind
== "Data was gathered here"
)
assert timdex_record_all_fields_and_subfields.locations[0].geodata == [
-77.025955,
38.942142,
]
assert (
timdex_record_all_fields_and_subfields.locations[0].geoshape
== "BBOX(-77.025955, 38.942142)"
jonavellecuerdo marked this conversation as resolved.
Show resolved Hide resolved
)
assert timdex_record_all_fields_and_subfields.notes[0].value == [
"This book is awesome"
]
Expand Down Expand Up @@ -305,7 +305,7 @@ def test_record_asdict_includes_all_fields(timdex_record_all_fields_and_subfield
"literary_form": "nonfiction",
"locations": [
{
"geodata": [-77.025955, 38.942142],
"geoshape": "BBOX(-77.025955, 38.942142)",
jonavellecuerdo marked this conversation as resolved.
Show resolved Hide resolved
"kind": "Data was gathered here",
"value": "A point on the globe",
}
Expand Down
25 changes: 0 additions & 25 deletions transmogrifier/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,31 +75,6 @@ def parse_date_from_string(
return None


def parse_geodata_string(geodata_string: str, source_record_id: str) -> list[float]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the reason for deleting this helper function? Just want to understand. 🤓

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we're just taking the WKT string as is given than OpenSearch can handle it, we don't need this function to do the parsing that we originally though was needed. Does that make sense?

"""Get list of values from a formatted geodata string.

Example:
- "ENVELOPE(-111.1, -104.0, 45.0, 40.9)"

Args:
geodata_string: Formatted geodata string to parse.
source_record_id: The ID of the record containing the string to parse.
"""
geodata_points = []
try:
raw_geodata_points = geodata_string.split("(")[-1].split(")")[0].split(",")
stripped_geodata_points = map(str.strip, raw_geodata_points)
geodata_floats = list(map(float, stripped_geodata_points))
geodata_points.extend(geodata_floats)
except ValueError:
message = (
f"Record ID '{source_record_id}': "
f"Unable to parse geodata string '{geodata_string}'"
)
raise ValueError(message)
return geodata_points


def validate_date(
date_string: str,
source_record_id: str,
Expand Down
4 changes: 1 addition & 3 deletions transmogrifier/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,7 @@ class Link:
class Location:
value: Optional[str] = field(default=None, validator=optional(instance_of(str)))
kind: Optional[str] = field(default=None, validator=optional(instance_of(str)))
geodata: Optional[list[float]] = field(
default=None, validator=optional(list_of(float))
)
geoshape: Optional[str] = field(default=None, validator=optional(instance_of(str)))


@define
Expand Down
45 changes: 28 additions & 17 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,14 @@ def record_is_deleted(cls, source_record: dict) -> bool:
Args:
source_record: A JSON object representing a source record.
"""
return False
if isinstance(source_record["gbl_suppressed_b"], bool):
return source_record["gbl_suppressed_b"]
else:
message = (
f"Record ID '{cls.get_source_record_id(source_record)}': "
"'gbl_suppressed_b' value is not a boolean"
)
raise ValueError(message)
Comment on lines +93 to +100
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is good. If we don't know definitively if the record is deleted or not, raises an error.


def get_optional_fields(self, source_record: dict) -> dict | None:
"""
Expand Down Expand Up @@ -292,13 +299,8 @@ def get_links(source_record: dict, source_record_id: str) -> list[timdex.Link]:
def get_locations(
source_record: dict, source_record_id: str
) -> list[timdex.Location]:
"""Get values from source record for TIMDEX locations field.

WIP: Currently in the process of determining our approach for storing geographic
geometry data in the TIMDEX record and how this dovetails with the OpenSearch
mapping. At this time, this method returns an empty list of Locations.
"""
locations: list[timdex.Location] = []
"""Get values from source record for TIMDEX locations field."""
locations = []

aardvark_location_fields = {
"dcat_bbox": "Bounding Box",
Expand All @@ -307,14 +309,22 @@ def get_locations(
for aardvark_location_field, kind_value in aardvark_location_fields.items():
if aardvark_location_field not in source_record:
continue
try:
if (
geodata_string := source_record[aardvark_location_field]
) and "ENVELOPE" in source_record[aardvark_location_field]:
locations.append(
timdex.Location(
geoshape=geodata_string.replace("ENVELOPE", "BBOX "),
kind=kind_value,
)
)
else:
message = (
f"Geometry field '{aardvark_location_field}' found, but "
f"currently not mapped."
f"Record ID '{source_record_id}': "
f"Unable to parse geodata string '{geodata_string}' "
f"in '{aardvark_location_field}'"
)
logger.debug(message)
except ValueError as exception:
logger.warning(exception)
logger.warning(message)
return locations

@staticmethod
Expand Down Expand Up @@ -385,9 +395,10 @@ def get_subjects(source_record: dict) -> list[timdex.Subject]:
subjects = []

aardvark_subject_fields = {
"dcat_keyword_sm": "DCAT Keyword",
"dcat_theme_sm": "DCAT Theme",
"dct_subject_sm": "Dublin Core Subject",
"dcat_keyword_sm": "DCAT; Keyword",
"dcat_theme_sm": "DCAT; Theme",
"dct_spatial_sm": "Dublin Core; Spatial",
"dct_subject_sm": "Dublin Core; Subject",
"gbl_resourceClass_sm": "Subject scheme not provided",
"gbl_resourceType_sm": "Subject scheme not provided",
}
Expand Down
Loading