Skip to content

Commit

Permalink
GDT-54 Create Aardvark transform
Browse files Browse the repository at this point in the history
Why these changes are being introduced:
* This is the initial structure for the Aardvark transform class. The class will be expanded with new methods in subsequent commits.

How this addresses that need:
* Add jsonlines to Pipfile
* Add fixtures for aardvark and generic JSONLines files
* Update argument type hinting for Transformer and JsonTransformer classes to clarify expected content types
* Update JsonTransformer.parse_source_file method to use jsonlines library
* Add Aardvark class with get_main_titles, get_source_record_id, record_is_deleted (in progress), get_optional_fields (in progress), and get_subjects methods and corresponding unit tests

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-54
  • Loading branch information
ehanson8 committed Dec 14, 2023
1 parent 5f061bb commit 1cfb37b
Show file tree
Hide file tree
Showing 8 changed files with 677 additions and 131 deletions.
3 changes: 2 additions & 1 deletion Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ name = "pypi"
attrs = "*"
beautifulsoup4 = "*"
click = "*"
jsonlines = "*"
lxml = "*"
python-dateutil = "*"
sentry-sdk = "*"
smart-open = {version = "*", extras = ["s3"]}
python-dateutil = "*"
types-python-dateutil = "*"

[dev-packages]
Expand Down
568 changes: 461 additions & 107 deletions Pipfile.lock

Large diffs are not rendered by default.

16 changes: 15 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import transmogrifier.models as timdex
from transmogrifier.config import SOURCES, load_external_config
from transmogrifier.sources.transformer import XmlTransformer
from transmogrifier.sources.transformer import JsonTransformer, XmlTransformer
from transmogrifier.sources.xml.datacite import Datacite


Expand Down Expand Up @@ -46,6 +46,15 @@ def runner():
return CliRunner()


@pytest.fixture()
def aardvark_record_all_fields():
    """Return the first record parsed from the all-fields Aardvark fixture.

    The fixture file is read with JsonTransformer.parse_source_file and only
    the first parsed record is handed to the test.
    """
    # The decorator is called with parentheses to match the other fixtures
    # in this module.
    return next(
        JsonTransformer.parse_source_file(
            "tests/fixtures/aardvark/aardvark_record_all_fields.jsonl"
        )
    )


@pytest.fixture()
def datacite_records():
return XmlTransformer.parse_source_file(
Expand All @@ -61,6 +70,11 @@ def datacite_record_all_fields():
return Datacite("cool-repo", source_records)


@pytest.fixture()
def json_records():
    """Return the parsed records from the generic JSONLines fixture file."""
    fixture_path = "tests/fixtures/json_records.jsonl"
    return JsonTransformer.parse_source_file(fixture_path)


@pytest.fixture()
def loc_country_crosswalk():
    """Return the external config loaded from config/loc-countries.xml."""
    crosswalk = load_external_config("config/loc-countries.xml", "xml")
    return crosswalk
Expand Down
1 change: 1 addition & 0 deletions tests/fixtures/aardvark/aardvark_record_all_fields.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id": "123", "dcat_keyword_sm": ["Country"], "dcat_theme_sm": ["Political boundaries"], "dct_spatial_sm": ["Some city, Some country"], "dct_subject_sm": ["Geography", "Earth"], "gbl_resourceClass_sm": ["Dataset"], "gbl_resourceType_sm": ["Vector data"], "dct_title_s": "Test title 1"}
2 changes: 2 additions & 0 deletions tests/fixtures/json_records.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "123"}
{"id": "456"}
43 changes: 43 additions & 0 deletions tests/sources/json/test_aardvark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import transmogrifier.models as timdex
from transmogrifier.sources.json.aardvark import Aardvark


def test_aardvark_get_required_fields_returns_expected_values(json_records):
    """Required fields are derived from the first generic JSON record."""
    aardvark = Aardvark("cool-repo", json_records)
    first_record = next(json_records)
    expected_fields = {
        "source": "A Cool Repository",
        "source_link": "https://example.com/123",
        "timdex_record_id": "cool-repo:123",
        "title": "Title not provided",
    }
    assert aardvark.get_required_fields(first_record) == expected_fields


def test_jsontransformer_transform_returns_timdex_record(json_records):
    """Iterating the transformer yields a fully built TIMDEX record."""
    expected_record = timdex.TimdexRecord(
        source="A Cool Repository",
        source_link="https://example.com/123",
        timdex_record_id="cool-repo:123",
        title="Title not provided",
        citation="Title not provided. Geospatial data. https://example.com/123",
        content_type=["Geospatial data"],
    )
    aardvark = Aardvark("cool-repo", json_records)
    first_output = next(aardvark)
    assert first_output == expected_record


def test_aardvark_get_main_titles_success(aardvark_record_all_fields):
    """The title of the all-fields record is returned as a one-item list."""
    main_titles = Aardvark.get_main_titles(aardvark_record_all_fields)
    assert main_titles == ["Test title 1"]


def test_aardvark_get_source_record_id_success(aardvark_record_all_fields):
    """The source record ID is read from the record's "id" value."""
    record_id = Aardvark.get_source_record_id(aardvark_record_all_fields)
    assert record_id == "123"


def test_aardvark_get_subjects_success(aardvark_record_all_fields):
    """Each subject value becomes its own Subject with the mapped kind."""
    expected_subjects = [
        timdex.Subject(value=["Country"], kind="DCAT Keyword"),
        timdex.Subject(value=["Political boundaries"], kind="DCAT Theme"),
        timdex.Subject(value=["Geography"], kind="Dublin Core Subject"),
        timdex.Subject(value=["Earth"], kind="Dublin Core Subject"),
        timdex.Subject(value=["Dataset"], kind="Subject scheme not provided"),
        timdex.Subject(value=["Vector data"], kind="Subject scheme not provided"),
    ]
    actual_subjects = Aardvark.get_subjects(aardvark_record_all_fields)
    assert actual_subjects == expected_subjects
121 changes: 121 additions & 0 deletions transmogrifier/sources/json/aardvark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import logging

import transmogrifier.models as timdex
from transmogrifier.sources.transformer import JsonTransformer

logger = logging.getLogger(__name__)


class Aardvark(JsonTransformer):
    """Transformer for Aardvark JSON source records."""

    @classmethod
    def get_main_titles(cls, source_record: dict) -> list[str]:
        """
        Retrieve main title(s) from an Aardvark JSON record.

        Overrides the inherited get_main_titles() method.

        Args:
            source_record: A JSON object representing a source record.

        Returns:
            A one-item list holding the "dct_title_s" value, or an empty list
            when that key is missing or its value is empty.
        """
        title = source_record.get("dct_title_s")
        return [title] if title else []

    @classmethod
    def get_source_record_id(cls, source_record: dict) -> str:
        """
        Get source record ID from a JSON record.

        Args:
            source_record: A JSON object representing a source record.

        Raises:
            KeyError: If the record has no "id" key.
        """
        return source_record["id"]

    @classmethod
    def record_is_deleted(cls, source_record: dict) -> bool:
        """
        Determine whether record has a status of deleted.

        WIP: always returns False for now; defined so that an Aardvark
        instance can be instantiated.

        Args:
            source_record: A JSON object representing a source record.
        """
        return False

    def get_optional_fields(self, source_record: dict) -> dict | None:
        """
        Retrieve optional TIMDEX fields from an Aardvark JSON record.

        Overrides the inherited get_optional_fields() method.

        Args:
            source_record: A JSON object representing a source record.

        Returns:
            A dict of optional field names to values. "languages" and
            "subjects" are None when the record provides no values for them.
        """
        fields: dict = {}

        # alternate_titles field not used in Aardvark

        # content_type
        fields["content_type"] = ["Geospatial data"]

        # contributors

        # dates

        # edition

        # format

        # funding_information

        # identifiers

        # languages
        # Key spelling corrected from "dct_langauge_sm", which never matched.
        fields["languages"] = source_record.get("dct_language_sm")

        # links

        # locations

        # notes

        # publication_information

        # related_items

        # rights

        # subjects
        # An empty subject list is stored as None rather than [].
        fields["subjects"] = self.get_subjects(source_record) or None

        # summary field
        return fields

    @staticmethod
    def get_subjects(source_record: dict) -> list[timdex.Subject]:
        """Get values from source record for TIMDEX subjects field.

        Every value found under a known subject field becomes its own Subject,
        in the order the fields are listed in the mapping below.

        Args:
            source_record: A JSON object representing a source record.

        Returns:
            A list of Subject objects; empty when the record has none of the
            known subject fields.
        """
        # Maps each source subject field to the "kind" recorded on the Subject.
        aardvark_subject_fields = {
            "dcat_keyword_sm": "DCAT Keyword",
            "dcat_theme_sm": "DCAT Theme",
            "dct_subject_sm": "Dublin Core Subject",
            "gbl_resourceClass_sm": "Subject scheme not provided",
            "gbl_resourceType_sm": "Subject scheme not provided",
        }
        # "or []" treats a missing field and an explicit null value alike.
        return [
            timdex.Subject(value=[subject], kind=kind_value)
            for field_name, kind_value in aardvark_subject_fields.items()
            for subject in source_record.get(field_name) or []
        ]
Loading

0 comments on commit 1cfb37b

Please sign in to comment.