-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Why these changes are being introduced: * This is the initial structure for the Aardvark transform class. The class will be expanded with new methods in subsequent commits. How this addresses that need: * Add jsonlines to Pipfile * Add fixtures for aardvark and generic JSONLines files * Update argument type hinting for Transformer and JsonTransformer classes to clarify expected content types * Update JsonTransformer.parse_source_file method to use jsonlines library * Add Aardvark class with get_main_titles, get_source_record_id, record_is_deleted (in progress), get_optional_fields (in progress), and get_subjects methods and corresponding unit tests Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/GDT-54
- Loading branch information
Showing
8 changed files
with
677 additions
and
131 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"id": "123", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "dct_title_s": "Test title 1"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"id": "123"} | ||
{"id": "456"} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import transmogrifier.models as timdex | ||
from transmogrifier.sources.json.aardvark import Aardvark | ||
|
||
|
||
def test_aardvark_get_required_fields_returns_expected_values(json_records):
    """Check get_required_fields() output for the first json_records record."""
    aardvark = Aardvark("cool-repo", json_records)
    first_record = next(json_records)
    expected_fields = {
        "source": "A Cool Repository",
        "source_link": "https://example.com/123",
        "timdex_record_id": "cool-repo:123",
        "title": "Title not provided",
    }
    assert aardvark.get_required_fields(first_record) == expected_fields
|
||
|
||
def test_jsontransformer_transform_returns_timdex_record(json_records):
    """Check that iterating an Aardvark transformer yields a TimdexRecord."""
    aardvark = Aardvark("cool-repo", json_records)
    first_output_record = next(aardvark)
    expected_record = timdex.TimdexRecord(
        source="A Cool Repository",
        source_link="https://example.com/123",
        timdex_record_id="cool-repo:123",
        title="Title not provided",
        citation="Title not provided. Geospatial data. https://example.com/123",
        content_type=["Geospatial data"],
    )
    assert first_output_record == expected_record
|
||
|
||
def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
    """Check that get_main_titles() returns the record's title as a list."""
    main_titles = Aardvark.get_main_titles(aardvark_record_all_fields)
    assert main_titles == ["Test title 1"]
|
||
|
||
def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
    """Check that get_source_record_id() returns the record's "id" value."""
    source_record_id = Aardvark.get_source_record_id(aardvark_record_all_fields)
    assert source_record_id == "123"
|
||
|
||
def test_aardvark_get_subjects_success(aardvark_record_all_fields):
    """Check that get_subjects() emits one Subject per value, in field order."""
    expected_value_kind_pairs = (
        ("Country", "DCAT Keyword"),
        ("Political boundaries", "DCAT Theme"),
        ("Geography", "Dublin Core Subject"),
        ("Earth", "Dublin Core Subject"),
        ("Dataset", "Subject scheme not provided"),
        ("Vector data", "Subject scheme not provided"),
    )
    expected_subjects = [
        timdex.Subject(value=[term], kind=scheme)
        for term, scheme in expected_value_kind_pairs
    ]
    assert Aardvark.get_subjects(aardvark_record_all_fields) == expected_subjects
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
import logging | ||
|
||
import transmogrifier.models as timdex | ||
from transmogrifier.sources.transformer import JsonTransformer | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Aardvark(JsonTransformer):
    """Aardvark transformer."""

    @classmethod
    def get_main_titles(cls, source_record: dict) -> list[str]:
        """
        Retrieve main title(s) from an Aardvark JSON record.

        Overrides the base class get_main_titles() method.

        Args:
            source_record: A JSON object representing a source record.

        Returns:
            A one-item list holding the "dct_title_s" value, or an empty list
            when that field is missing or empty.
        """
        title = source_record.get("dct_title_s")
        return [title] if title else []

    @classmethod
    def get_source_record_id(cls, source_record: dict) -> str:
        """
        Get source record ID from a JSON record.

        Args:
            source_record: A JSON object representing a source record.

        Returns:
            The value stored under the record's "id" key.

        Raises:
            KeyError: If the record has no "id" key.
        """
        return source_record["id"]

    @classmethod
    def record_is_deleted(cls, source_record: dict) -> bool:
        """
        Determine whether record has a status of deleted.

        ## WIP - defining to enable instantiation of Aardvark instance.
        Currently every record is reported as not deleted.

        Args:
            source_record: A JSON object representing a source record.
        """
        return False

    def get_optional_fields(self, source_record: dict) -> dict | None:
        """
        Retrieve optional TIMDEX fields from an Aardvark JSON record.

        Overrides the base class get_optional_fields() method.

        Args:
            source_record: A JSON object representing a source record.

        Returns:
            A dict of optional field names to values. "languages" and
            "subjects" are set to None when the record provides no values.
        """
        fields: dict = {}

        # alternate_titles field not used in Aardvark

        # content_type
        fields["content_type"] = ["Geospatial data"]

        # contributors

        # dates

        # edition

        # format

        # funding_information

        # identifiers

        # languages
        # The Aardvark field is spelled "dct_language_sm"; a misspelled key
        # would silently yield None for every record.
        fields["languages"] = source_record.get("dct_language_sm")

        # links

        # locations

        # notes

        # publication_information

        # related_items

        # rights

        # subjects
        # An empty subject list is normalized to None.
        fields["subjects"] = self.get_subjects(source_record) or None

        # summary field
        return fields

    @staticmethod
    def get_subjects(source_record: dict) -> list[timdex.Subject]:
        """Get values from source record for TIMDEX subjects field.

        Each value found under a recognized Aardvark subject field becomes its
        own Subject, labelled with the kind mapped to that field. Subjects are
        emitted in mapping order, then in the order the values appear.

        Args:
            source_record: A JSON object representing a source record.

        Returns:
            A list of Subject objects; empty when no subject fields are present.
        """
        aardvark_subject_fields = {
            "dcat_keyword_sm": "DCAT Keyword",
            "dcat_theme_sm": "DCAT Theme",
            "dct_subject_sm": "Dublin Core Subject",
            "gbl_resourceClass_sm": "Subject scheme not provided",
            "gbl_resourceType_sm": "Subject scheme not provided",
        }
        subjects: list[timdex.Subject] = []
        for aardvark_subject_field, kind_value in aardvark_subject_fields.items():
            # Fields absent from the record contribute no subjects.
            subjects.extend(
                timdex.Subject(value=[subject], kind=kind_value)
                for subject in source_record.get(aardvark_subject_field, [])
            )
        return subjects
Oops, something went wrong.