From 2df0ab38ec048e56f63d1634162ed233fb9febd4 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Fri, 26 Jan 2024 15:12:13 -0500 Subject: [PATCH] Update links handling for Aardvark transform Why these changes are being introduced: GeoHarvester changed the structure of the dct_references_s JSON string, so Transmogrifier must be updated to parse the new structure. How this addresses that need: * Field method get_links() is updated to handle the new http://schema.org/url (website) link type in the JSON string * schema.org URI keys now use http:// instead of https:// to be consistent with other OGM metadata Side effects of this change: * TIMDEX records may include additional links of kind "Website" Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/GDT-149 --- .../aardvark/aardvark_record_all_fields.jsonl | 2 +- tests/sources/json/test_aardvark.py | 19 ++++++++++++++----- transmogrifier/sources/json/aardvark.py | 10 +++++++++- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl b/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl index 30c32bc..55e3d56 100644 --- a/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl +++ b/tests/fixtures/aardvark/aardvark_record_all_fields.jsonl @@ -1 +1 @@ -{"id": "mit:123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_identifier_sm": ["abc123"], "dct_issued_s": "2003-10-23", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_references_s": "{\"https://schema.org/downloadUrl\": [{\"label\": \"Source Metadata\", \"protocol\": \"Download\", \"url\": 
\"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml\"}, {\"label\": \"Normalized Metadata\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.normalized.aardvark.json\"}, {\"label\": \"Data Zipfile\", \"protocol\": \"Download\", \"url\": \"https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.zip\"}]}", "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_temporal_sm": ["1943", "1979"], "dct_title_s": "Test title 1", "gbl_dateRange_drsim": ["[1943 TO 1946]"], "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_indexYear_im": [1943,1944,1945,1946], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "gbl_suppressed_b": false, "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "schema_provider_s": "MIT"} \ No newline at end of file +{"id": "mit:123", "dcat_bbox": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_accessRights_s": "Access note", "dct_alternative_sm": ["Alternate title"], "dct_creator_sm": ["Smith, Jane", "Smith, John"], "dct_description_sm": ["A description"], "dct_format_s": "Shapefile", "dct_identifier_sm": ["abc123"], "dct_issued_s": "2003-10-23", "dct_language_sm": ["eng"], "dct_license_sm": ["http://license.license", "http://another_license.another_license"], "dct_publisher_sm": ["ML InfoMap (Firm)"], "dct_references_s": "{\"http://schema.org/downloadUrl\": [{\"label\": \"Source Metadata\", \"url\": 
\"https://cdn.dev1.mitlibrary.net/geo/public/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml\"}, {\"label\": \"Aardvark Metadata\", \"url\": \"https://cdn.dev1.mitlibrary.net/geo/public/GISPORTAL_GISOWNER01_BOSTONWATER95.normalized.aardvark.json\"}, {\"label\": \"Data\", \"url\": \"https://cdn.dev1.mitlibrary.net/geo/public/GISPORTAL_GISOWNER01_BOSTONWATER95.zip\"}], \"http://schema.org/url\": [{\"label\": \"Website\", \"url\": \"https://search.libraries.mit.edu/record/gismit:GISPORTAL_GISOWNER01_BOSTONWATER95\"}]}", "dct_rights_sm": ["Some person has the rights"], "dct_rightsHolder_sm": ["The person with the rights", "Another person with the rights"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "dct_temporal_sm": ["1943", "1979"], "dct_title_s": "Test title 1", "gbl_dateRange_drsim": ["[1943 TO 1946]"], "gbl_displayNote_sm": ["Danger: This text will be displayed in a red box","Info: This text will be displayed in a blue box","Tip: This text will be displayed in a green box","Warning: This text will be displayed in a yellow box","This is text without a tag and it will be assigned default 'note' style"], "gbl_indexYear_im": [1943,1944,1945,1946], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "gbl_suppressed_b": false, "locn_geometry": "ENVELOPE(-111.1, -104.0, 45.0, 40.9)", "schema_provider_s": "MIT"} \ No newline at end of file diff --git a/tests/sources/json/test_aardvark.py b/tests/sources/json/test_aardvark.py index 5df5cbb..c5bca1d 100644 --- a/tests/sources/json/test_aardvark.py +++ b/tests/sources/json/test_aardvark.py @@ -161,20 +161,29 @@ def test_aardvark_get_identifiers_success(aardvark_record_all_fields): def test_aardvark_get_links_success(aardvark_record_all_fields): assert MITAardvark.get_links(next(aardvark_record_all_fields), "123") == [ timdex.Link( - url="https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml", + 
url="https://cdn.dev1.mitlibrary.net/geo/public" + "/GISPORTAL_GISOWNER01_BOSTONWATER95.source.fgdc.xml", kind="Download", text="Source Metadata", ), timdex.Link( - url="https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95." + url="https://cdn.dev1.mitlibrary.net/geo/public" + "/GISPORTAL_GISOWNER01_BOSTONWATER95." "normalized.aardvark.json", kind="Download", - text="Normalized Metadata", + text="Aardvark Metadata", ), timdex.Link( - url="https://example.com/GISPORTAL_GISOWNER01_BOSTONWATER95.zip", + url="https://cdn.dev1.mitlibrary.net/geo/public" + "/GISPORTAL_GISOWNER01_BOSTONWATER95.zip", kind="Download", - text="Data Zipfile", + text="Data", + ), + timdex.Link( + url="https://search.libraries.mit.edu/record/gismit" + ":GISPORTAL_GISOWNER01_BOSTONWATER95", + kind="Website", + text="Website", ), ] diff --git a/transmogrifier/sources/json/aardvark.py b/transmogrifier/sources/json/aardvark.py index f0c24b7..9a345b8 100644 --- a/transmogrifier/sources/json/aardvark.py +++ b/transmogrifier/sources/json/aardvark.py @@ -285,7 +285,15 @@ def get_links(source_record: dict, source_record_id: str) -> list[timdex.Link]: timdex.Link( url=link.get("url"), kind="Download", text=link.get("label") ) - for link in links_object.get("https://schema.org/downloadUrl") + for link in links_object.get("http://schema.org/downloadUrl", []) + ] + ) + links.extend( + [ + timdex.Link( + url=link.get("url"), kind="Website", text=link.get("label") + ) + for link in links_object.get("http://schema.org/url", []) ] ) except ValueError: