diff --git a/.gitignore b/.gitignore
index 38b13f4..8e8d033 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,6 +87,9 @@ ipython_config.py
# intended to run in multiple environments; otherwise, check them in:
# .python-version
+# asdf
+.tool-versions
+
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
diff --git a/tests/conftest.py b/tests/conftest.py
index 2785419..0c93b89 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -89,17 +89,11 @@ def marc_content_type_crosswalk():
return load_external_config("config/marc_content_type_crosswalk.json", "json")
-# oaidc ##########################
-
-
@pytest.fixture
def oai_pmh_records():
return XMLTransformer.parse_source_file("tests/fixtures/oai_pmh_records.xml")
-# timdex ##########################
-
-
@pytest.fixture
def timdex_record_required_fields():
return timdex.TimdexRecord(
diff --git a/tests/fixtures/oai_dc/oaidc_record_valid_generic_date.xml b/tests/fixtures/oai_dc/oaidc_record_valid_generic_date.xml
deleted file mode 100644
index a378095..0000000
--- a/tests/fixtures/oai_dc/oaidc_record_valid_generic_date.xml
+++ /dev/null
@@ -1,26 +0,0 @@
-
-
-
-
- oai:libguides.com:guides/175846
- 2023-05-31T19:49:21Z
- guides
-
-
-
- Materials Science & Engineering
- Ye Li
- Engineering
- Science
- Useful databases and other research tips for materials science.
- MIT Libraries
- 2008-06-19T17:55:27
- https://libguides.mit.edu/materials
-
-
-
-
\ No newline at end of file
diff --git a/tests/sources/xml/test_oai_dc.py b/tests/sources/xml/test_oai_dc.py
index 2b98737..f7ce89e 100644
--- a/tests/sources/xml/test_oai_dc.py
+++ b/tests/sources/xml/test_oai_dc.py
@@ -1,36 +1,48 @@
+import pytest
+from bs4 import BeautifulSoup
+
import transmogrifier.models as timdex
+from transmogrifier.exceptions import SkippedRecordEvent
from transmogrifier.sources.xml.oaidc import OaiDc
-FIXTURES_PREFIX = "tests/fixtures/oai_dc"
-
-BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord(
- source="LibGuides",
- source_link="https://libguides.mit.edu/guides/175846",
- timdex_record_id="libguides:guides-175846",
- title="Materials Science & Engineering",
- citation="Materials Science & Engineering. libguides. "
- "https://libguides.mit.edu/guides/175846",
- content_type=["libguides"],
- format="electronic resource",
- identifiers=[
- timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH")
- ],
-)
+def create_oaidc_source_record_stub(
+ header_insert: str = "", metadata_insert: str = ""
+) -> BeautifulSoup:
+
+ xml_str = f"""
+
+
+
+
+
+ {metadata_insert}
+
+
+
+ Ye Li
+ """
+ )
+ )
+ assert OaiDc.get_contributors(source_record) == [
+ timdex.Contributor(
+ value="Ye Li",
+ kind="Creator",
+ )
+ ]
+
+
+def test_get_contributors_transforms_correctly_if_fields_blank():
+ source_record = create_oaidc_source_record_stub(
+ metadata_insert=(
+ """
+
+ """
+ )
+ )
+ assert OaiDc.get_contributors(source_record) is None
+
+
+def test_get_contributors_transforms_correctly_if_fields_missing():
+ source_record = create_oaidc_source_record_stub()
+ assert OaiDc.get_contributors(source_record) is None
+
+
+def test_get_dates_success():
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+ oai:libguides.com:guides/175846
+ """
+ ),
+ metadata_insert=(
+ """
+ 2008-06-19T17:55:27
+ """
+ ),
+ )
+ assert OaiDc.get_dates(source_record) == [
+ timdex.Date(kind="Unknown", value="2008-06-19T17:55:27")
+ ]
+
+
+def test_get_dates_transforms_correctly_if_fields_blank():
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+ oai:libguides.com:guides/175846
+ """
+ ),
+ metadata_insert=(
+ """
+
+ """
+ ),
+ )
+ assert OaiDc.get_dates(source_record) is None
+
+
+def test_get_dates_transforms_correctly_if_fields_missing():
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+ oai:libguides.com:guides/175846
+ """
+ )
+ )
+ assert OaiDc.get_dates(source_record) is None
+
+
+def test_get_dates_transforms_correctly_if_date_invalid():
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+ oai:libguides.com:guides/175846
+ """
+ ),
+ metadata_insert=(
+ """
+ INVALID
+ """
+ ),
+ )
+ assert OaiDc.get_dates(source_record) is None
+
+
+def test_get_identifiers_success():
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+ oai:libguides.com:guides/175846
+ """
+ )
)
- transformer_instance = OaiDc("libguides", source_records)
- xml = next(transformer_instance.source_records)
- assert transformer_instance.get_dates("test_source_record_id", xml) == [
- timdex.Date(kind="Unknown", note=None, range=None, value="2008-06-19T17:55:27")
+ assert OaiDc.get_identifiers(source_record) == [
+ timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH")
]
+
+
+def test_get_identifiers_transforms_correctly_if_fields_blank():
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+
+ """
+ )
+ )
+ assert OaiDc.get_identifiers(source_record) is None
+
+
+def test_get_identifiers_transforms_correctly_if_fields_missing():
+ source_record = create_oaidc_source_record_stub()
+ assert OaiDc.get_identifiers(source_record) is None
+
+
+def test_get_publishers_success():
+ source_record = create_oaidc_source_record_stub(
+ metadata_insert=(
+ """
+ MIT Libraries
+ """
+ )
+ )
+ assert OaiDc.get_publishers(source_record) == [timdex.Publisher(name="MIT Libraries")]
+
+
+def test_get_publishers_transforms_correctly_if_fields_blank():
+ source_record = create_oaidc_source_record_stub(
+ metadata_insert=(
+ """
+
+ """
+ )
+ )
+ assert OaiDc.get_publishers(source_record) is None
+
+
+def test_get_publishers_transforms_correctly_if_fields_missing():
+ source_record = create_oaidc_source_record_stub()
+ assert OaiDc.get_publishers(source_record) is None
+
+
+def test_get_subjects_success():
+ source_record = create_oaidc_source_record_stub(
+ metadata_insert=(
+ """
+ Engineering
+ Science
+ """
+ )
+ )
+ assert OaiDc.get_subjects(source_record) == [
+ timdex.Subject(
+ value=["Engineering", "Science"], kind="Subject scheme not provided"
+ )
+ ]
+
+
+def test_get_subjects_transforms_correctly_if_fields_blank():
+ source_record = create_oaidc_source_record_stub(
+ metadata_insert=(
+ """
+
+ """
+ )
+ )
+ assert OaiDc.get_subjects(source_record) is None
+
+
+def test_get_subjects_transforms_correctly_if_fields_missing():
+ source_record = create_oaidc_source_record_stub()
+ assert OaiDc.get_subjects(source_record) is None
+
+
+def test_get_summary_success():
+ source_record = create_oaidc_source_record_stub(
+ metadata_insert=(
+ ""
+ "Useful databases and other research tips for materials science."
+ ""
+ )
+ )
+ assert OaiDc.get_summary(source_record) == [
+ "Useful databases and other research tips for materials science."
+ ]
+
+
+def test_get_summary_transforms_properly_if_fields_blank():
+ source_record = create_oaidc_source_record_stub(
+ metadata_insert=(
+ """
+
+ """
+ )
+ )
+ assert OaiDc.get_summary(source_record) is None
+
+
+def test_get_summary_transforms_properly_if_fields_missing():
+ source_record = create_oaidc_source_record_stub()
+ assert OaiDc.get_summary(source_record) is None
+
+
+def test_get_main_titles_success():
+ source_record = create_oaidc_source_record_stub(
+ metadata_insert=(
+ """
+ Materials Science & Engineering
+ """
+ )
+ )
+ assert OaiDc.get_main_titles(source_record) == ["Materials Science & Engineering"]
+
+
+def test_get_main_titles_transforms_properly_if_fields_blank():
+ source_record = create_oaidc_source_record_stub(
+ metadata_insert=(
+ """
+
+ """
+ )
+ )
+ assert OaiDc.get_main_titles(source_record) == []
+
+
+def test_get_main_titles_transforms_properly_if_fields_missing():
+ source_record = create_oaidc_source_record_stub()
+ assert OaiDc.get_main_titles(source_record) == []
+
+
+def test_get_source_record_id_success():
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+ oai:libguides.com:guides/175846
+ """
+ )
+ )
+ assert OaiDc.get_source_record_id(source_record) == "guides/175846"
+
+
+def test_get_source_record_id_raises_skipped_record_event_if_fields_blank():
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+
+ """
+ )
+ )
+ with pytest.raises(
+ SkippedRecordEvent,
+ match=(
+ "Record skipped because 'source_record_id' could not be derived. "
+ "The 'identifier' was either missing from the header element or blank."
+ ),
+ ):
+ OaiDc.get_source_record_id(source_record)
+
+
+def test_get_source_record_id_raises_skipped_record_event_if_fields_missing():
+ source_record = create_oaidc_source_record_stub()
+ with pytest.raises(
+ SkippedRecordEvent,
+ match=(
+ "Record skipped because 'source_record_id' could not be derived. "
+ "The 'identifier' was either missing from the header element or blank."
+ ),
+ ):
+ OaiDc.get_source_record_id(source_record)
diff --git a/tests/sources/xml/test_springshare.py b/tests/sources/xml/test_springshare.py
index f02c96f..56973d6 100644
--- a/tests/sources/xml/test_springshare.py
+++ b/tests/sources/xml/test_springshare.py
@@ -1,97 +1,222 @@
+# ruff: noqa: E501
import logging
+import pytest
+from bs4 import BeautifulSoup
+
import transmogrifier.models as timdex
+from transmogrifier.exceptions import SkippedRecordEvent
from transmogrifier.sources.xml.springshare import SpringshareOaiDc
SPRINGSHARE_FIXTURES_PREFIX = "tests/fixtures/oai_dc/springshare"
-
LIBGUIDES_FIXTURES_PREFIX = f"{SPRINGSHARE_FIXTURES_PREFIX}/libguides"
-
RESEARCHDATABASES_FIXTURES_PREFIX = f"{SPRINGSHARE_FIXTURES_PREFIX}/research_databases"
-LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord(
- source="LibGuides",
- source_link="https://libguides.mit.edu/materials",
- timdex_record_id="libguides:guides-175846",
- title="Materials Science & Engineering",
- citation="Materials Science & Engineering. libguides. "
- "https://libguides.mit.edu/materials",
- content_type=["libguides"],
- format="electronic resource",
- identifiers=[
- timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH")
- ],
- links=[
+def create_oaidc_source_record_stub(
+ header_insert: str = "", metadata_insert: str = ""
+) -> BeautifulSoup:
+
+ xml_str = f"""
+
+
+
+
+
+ {metadata_insert}
+
+
+
+ oai:libguides.com:guides/175846
+ """
+ ),
+ metadata_insert=(
+ """
+ January 1st, 2000
+ """
+ ),
+ )
+ assert SpringshareOaiDc.get_dates(source_record) == [
+ timdex.Date(kind="Created", value="2000-01-01T00:00:00")
+ ]
+
+
+def test_get_dates_transforms_correctly_if_optional_fields_blank():
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+ oai:libguides.com:guides/175846
+ """
+ ),
+ metadata_insert=(
+ """
+
+ """
+ ),
+ )
+ assert SpringshareOaiDc.get_dates(source_record) is None
+
+
+def test_get_dates_transforms_correctly_if_optional_fields_missing():
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+ oai:libguides.com:guides/175846
+ """
+ )
+ )
+ assert SpringshareOaiDc.get_dates(source_record) is None
+
+
+def test_get_dates_transforms_correctly_and_logs_error_if_date_invalid(
+ caplog,
+):
+ caplog.set_level(logging.DEBUG)
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+ oai:libguides.com:guides/175846
+ """
+ ),
+ metadata_insert=(
+ """
+ INVALID
+ """
+ ),
+ )
+ assert SpringshareOaiDc.get_dates(source_record) is None
+ assert (
+ "Record ID guides/175846 has a date that cannot be parsed: Unknown string format: INVALID"
+ in caplog.text
+ )
+
+
+def test_get_links_success():
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+ oai:libguides.com:guides/175846
+ """
+ ),
+ metadata_insert=(
+ """
+ https://libguides.mit.edu/materials
+ """
+ ),
+ )
+ assert SpringshareOaiDc("libguides", iter(())).get_links(
+ source_record=source_record
+ ) == [
timdex.Link(
- url="https://libguides.mit.edu/materials",
kind="LibGuide URL",
text="LibGuide URL",
+ url="https://libguides.mit.edu/materials",
)
- ],
-)
-
-RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord(
- source="Research Databases",
- source_link="https://libguides.mit.edu/llba",
- timdex_record_id="researchdatabases:az-65257807",
- title="Linguistics and Language Behavior Abstracts (LLBA)",
- citation="Linguistics and Language Behavior Abstracts (LLBA). researchdatabases. "
- "https://libguides.mit.edu/llba",
- content_type=["researchdatabases"],
- format="electronic resource",
- identifiers=[
- timdex.Identifier(value="oai:libguides.com:az/65257807", kind="OAI-PMH")
- ],
- links=[
- timdex.Link(
- url="https://libguides.mit.edu/llba",
- kind="Research Database URL",
- text="Research Database URL",
+ ]
+
+
+def test_get_links_transforms_correctly_if_required_fields_blank():
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+ oai:libguides.com:guides/175846
+ """
+ ),
+ metadata_insert=(
+ """
+
+ """
+ ),
+ )
+ assert SpringshareOaiDc("libguides", iter(())).get_links(source_record) is None
+
+
+def test_get_links_transforms_correctly_if_required_fields_missing():
+ source_record = create_oaidc_source_record_stub(
+ header_insert=(
+ """
+ oai:libguides.com:guides/175846
+ """
)
- ],
-)
+ )
+ assert SpringshareOaiDc("libguides", iter(())).get_links(source_record) is None
-def test_springshare_get_dates_valid():
- source_records = SpringshareOaiDc.parse_source_file(
- f"{SPRINGSHARE_FIXTURES_PREFIX}/springshare_valid_dates.xml"
- )
- transformer_instance = SpringshareOaiDc("libguides", source_records)
- for xml in transformer_instance.source_records:
- date_field_value = transformer_instance.get_dates("test_get_dates", xml)
- assert date_field_value == [
- timdex.Date(
- kind="Created", note=None, range=None, value="2000-01-01T00:00:00"
- )
- ]
+def test_get_source_link_success():
+ source_record = create_oaidc_source_record_stub(
+ metadata_insert=(
+ """
+ https://libguides.mit.edu/materials
+ """
+ )
+ )
+ assert (
+ SpringshareOaiDc.get_source_link("", "", source_record)
+ == "https://libguides.mit.edu/materials"
+ )
-def test_springshare_get_dates_invalid_logged_and_skipped(caplog):
- caplog.set_level(logging.DEBUG)
- source_records = SpringshareOaiDc.parse_source_file(
- f"{SPRINGSHARE_FIXTURES_PREFIX}/springshare_invalid_dates.xml"
+def test_get_source_link_raises_skipped_record_event_if_required_fields_blank():
+ source_record = create_oaidc_source_record_stub(
+ metadata_insert=(
+ """
+
+ """
+ )
)
- transformer_instance = SpringshareOaiDc("libguides", source_records)
- for xml in transformer_instance.source_records:
- date_field_value = transformer_instance.get_dates("test_get_dates", xml)
- assert date_field_value == []
- assert "has a date that cannot be parsed" in caplog.text
+ with pytest.raises(
+ SkippedRecordEvent,
+ match=(
+ "Record skipped because 'source_link' could not be derived. "
+ "The 'identifier' was either missing from the header element or blank."
+ ),
+ ):
+ SpringshareOaiDc.get_source_link("", "", source_record)
-def test_springshare_get_links_missing_identifier_logged_and_skipped(caplog):
- caplog.set_level(logging.DEBUG)
- source_records = SpringshareOaiDc.parse_source_file(
- f"{SPRINGSHARE_FIXTURES_PREFIX}/springshare_record_missing_required_fields.xml"
+def test_get_source_link_raises_skipped_record_event_if_required_fields_missing():
+ source_record = create_oaidc_source_record_stub(
+ metadata_insert=(
+ """
+
+ """
+ )
)
- transformer_instance = SpringshareOaiDc("libguides", source_records)
- for xml in transformer_instance.source_records:
- links_field_value = transformer_instance.get_links("test_get_links", xml)
- assert links_field_value == []
- assert "has links that cannot be generated" in caplog.text
+ with pytest.raises(
+ SkippedRecordEvent,
+ match=(
+ "Record skipped because 'source_link' could not be derived. "
+ "The 'identifier' was either missing from the header element or blank."
+ ),
+ ):
+ SpringshareOaiDc.get_source_link("", "", source_record)
+
+###########################
+# Springshare - LibGuides
+###########################
-def test_libguide_transform_with_all_fields_transforms_correctly():
+
+def test_springshare_libguides_transform_with_all_fields_transforms_correctly():
source_records = SpringshareOaiDc.parse_source_file(
f"{LIBGUIDES_FIXTURES_PREFIX}/libguides_record_all_fields.xml"
)
@@ -135,23 +260,66 @@ def test_libguide_transform_with_all_fields_transforms_correctly():
)
-def test_libguides_transform_with_optional_fields_blank_transforms_correctly():
+def test_springshare_libguides_transform_with_optional_fields_blank_transforms_correctly():
source_records = SpringshareOaiDc.parse_source_file(
f"{LIBGUIDES_FIXTURES_PREFIX}/libguides_record_optional_fields_blank.xml"
)
output_records = SpringshareOaiDc("libguides", source_records)
- assert next(output_records) == LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX
+ assert next(output_records) == timdex.TimdexRecord(
+ source="LibGuides",
+ source_link="https://libguides.mit.edu/materials",
+ timdex_record_id="libguides:guides-175846",
+ title="Materials Science & Engineering",
+ citation="Materials Science & Engineering. libguides. "
+ "https://libguides.mit.edu/materials",
+ content_type=["libguides"],
+ format="electronic resource",
+ identifiers=[
+ timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH")
+ ],
+ links=[
+ timdex.Link(
+ url="https://libguides.mit.edu/materials",
+ kind="LibGuide URL",
+ text="LibGuide URL",
+ )
+ ],
+ )
-def test_libguides_transform_with_optional_fields_missing_transforms_correctly():
+def test_springshare_libguides_transform_with_optional_fields_missing_transforms_correctly():
source_records = SpringshareOaiDc.parse_source_file(
f"{LIBGUIDES_FIXTURES_PREFIX}/libguides_record_optional_fields_missing.xml"
)
output_records = SpringshareOaiDc("libguides", source_records)
- assert next(output_records) == LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX
+ assert next(output_records) == timdex.TimdexRecord(
+ source="LibGuides",
+ source_link="https://libguides.mit.edu/materials",
+ timdex_record_id="libguides:guides-175846",
+ title="Materials Science & Engineering",
+ citation="Materials Science & Engineering. libguides. "
+ "https://libguides.mit.edu/materials",
+ content_type=["libguides"],
+ format="electronic resource",
+ identifiers=[
+ timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH")
+ ],
+ links=[
+ timdex.Link(
+ url="https://libguides.mit.edu/materials",
+ kind="LibGuide URL",
+ text="LibGuide URL",
+ )
+ ],
+ )
-def test_research_databases_transform_with_all_fields_transforms_correctly():
+####################################
+# Springshare - Research Databases
+####################################
+
+
+def test_springshare_research_databases_transform_with_all_fields_transforms_correctly():
source_records = SpringshareOaiDc.parse_source_file(
f"{RESEARCHDATABASES_FIXTURES_PREFIX}/research_databases_record_all_fields.xml"
)
@@ -191,23 +359,57 @@ def test_research_databases_transform_with_all_fields_transforms_correctly():
)
-def test_research_databases_transform_with_optional_fields_blank_transforms_correctly():
+def test_springshare_research_databases_transform_with_optional_fields_blank_transforms_correctly():
source_records = SpringshareOaiDc.parse_source_file(
RESEARCHDATABASES_FIXTURES_PREFIX
+ "/research_databases_record_optional_fields_blank.xml"
)
output_records = SpringshareOaiDc("researchdatabases", source_records)
- assert (
- next(output_records) == RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX
+ assert next(output_records) == timdex.TimdexRecord(
+ source="Research Databases",
+ source_link="https://libguides.mit.edu/llba",
+ timdex_record_id="researchdatabases:az-65257807",
+ title="Linguistics and Language Behavior Abstracts (LLBA)",
+ citation="Linguistics and Language Behavior Abstracts (LLBA). researchdatabases. "
+ "https://libguides.mit.edu/llba",
+ content_type=["researchdatabases"],
+ format="electronic resource",
+ identifiers=[
+ timdex.Identifier(value="oai:libguides.com:az/65257807", kind="OAI-PMH")
+ ],
+ links=[
+ timdex.Link(
+ url="https://libguides.mit.edu/llba",
+ kind="Research Database URL",
+ text="Research Database URL",
+ )
+ ],
)
-def test_research_databases_transform_with_optional_fields_missing_transforms_correctly():
+def test_springshare_research_databases_transform_with_optional_fields_missing_transforms_correctly():
source_records = SpringshareOaiDc.parse_source_file(
RESEARCHDATABASES_FIXTURES_PREFIX
+ "/research_databases_record_optional_fields_missing.xml"
)
output_records = SpringshareOaiDc("researchdatabases", source_records)
- assert (
- next(output_records) == RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX
+ assert next(output_records) == timdex.TimdexRecord(
+ source="Research Databases",
+ source_link="https://libguides.mit.edu/llba",
+ timdex_record_id="researchdatabases:az-65257807",
+ title="Linguistics and Language Behavior Abstracts (LLBA)",
+ citation="Linguistics and Language Behavior Abstracts (LLBA). researchdatabases. "
+ "https://libguides.mit.edu/llba",
+ content_type=["researchdatabases"],
+ format="electronic resource",
+ identifiers=[
+ timdex.Identifier(value="oai:libguides.com:az/65257807", kind="OAI-PMH")
+ ],
+ links=[
+ timdex.Link(
+ url="https://libguides.mit.edu/llba",
+ kind="Research Database URL",
+ text="Research Database URL",
+ )
+ ],
)
diff --git a/transmogrifier/sources/xml/oaidc.py b/transmogrifier/sources/xml/oaidc.py
index 08c2ae3..952eb8f 100644
--- a/transmogrifier/sources/xml/oaidc.py
+++ b/transmogrifier/sources/xml/oaidc.py
@@ -3,6 +3,7 @@
from bs4 import Tag # type: ignore[import-untyped]
import transmogrifier.models as timdex
+from transmogrifier.exceptions import SkippedRecordEvent
from transmogrifier.helpers import validate_date
from transmogrifier.sources.xmltransformer import XMLTransformer
@@ -17,18 +18,15 @@ class OaiDc(XMLTransformer):
anticipated this will most likely get extended by a source-specific transformer.
"""
- def get_optional_fields(self, xml: Tag) -> dict | None:
+ def get_optional_fields(self, source_record: Tag) -> dict | None:
"""
Retrieve optional TIMDEX fields from a generic OAI DC XML record.
Args:
- xml: A BeautifulSoup Tag representing a single OAI DC XML record
+ source_record: A BeautifulSoup Tag representing a single OAI DC record in XML.
"""
fields: dict = {}
- # extract source_record_id early for use and logging
- source_record_id = self.get_source_record_id(xml)
-
# alternate_titles: not set in this transformation
# call_numbers: not set in this transformation
@@ -36,45 +34,34 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# citation: uses fallback get_citation() method
# content_type
- fields["content_type"] = [self.source]
+ fields["content_type"] = self.get_content_type()
# contents: not set in this transformation
# contributors
- for creator in [c for c in xml.find_all("dc:creator") if c.string]:
- fields.setdefault("contributors", []).append(
- timdex.Contributor(
- value=str(creator.string),
- kind="Creator",
- )
- )
+ fields["contributors"] = self.get_contributors(source_record)
# dates
- fields["dates"] = self.get_dates(source_record_id, xml) or None
+ fields["dates"] = self.get_dates(source_record)
# edition: not set in this transformation
# file_formats: not set in this transformation
# format
- fields["format"] = "electronic resource"
+ fields["format"] = self.get_format()
# funding_information: not set in this transformation
# holdings: not set in this transformation
# identifiers
- fields.setdefault("identifiers", []).append(
- timdex.Identifier(
- value=str(xml.header.identifier.string),
- kind="OAI-PMH",
- )
- )
+ fields["identifiers"] = self.get_identifiers(source_record)
# languages: not set in this transformation
# links
- fields["links"] = self.get_links(source_record_id, xml) or None
+ fields["links"] = self.get_links(source_record)
# literary_form: not set in this transformation
@@ -89,34 +76,31 @@ def get_optional_fields(self, xml: Tag) -> dict | None:
# publication_frequency: not set in this transformation
# publishers
- fields["publishers"] = [
- timdex.Publisher(name=str(p.string))
- for p in xml.find_all("dc:publisher")
- if p.string
- ] or None
+ fields["publishers"] = self.get_publishers(source_record)
# related_items: not set in this transformation
# rights: not set in this transformation
# subjects
- subjects_dict: dict[str, list[str]] = {}
- for subject in xml.metadata.find_all("dc:subject", string=True):
- subjects_dict.setdefault("Subject scheme not provided", []).append(
- str(subject.string)
- )
- fields["subjects"] = [
- timdex.Subject(value=value, kind=key) for key, value in subjects_dict.items()
- ] or None
+ fields["subjects"] = self.get_subjects(source_record)
# summary
- # uses description list retrieved for notes field
- for description in [d for d in xml.find_all("dc:description") if d.string]:
- fields.setdefault("summary", []).append(str(description.string))
-
+ fields["summary"] = self.get_summary(source_record)
return fields
- def get_dates(self, source_record_id: str, xml: Tag) -> list[timdex.Date]:
+ def get_content_type(self) -> list[str]:
+ """Return the TIMDEX "content_type" field: a one-item list holding self.source."""
+ return [self.source]
+
+    @classmethod
+    def get_contributors(cls, source_record: Tag) -> list[timdex.Contributor] | None:
+        """
+        Build the TIMDEX "contributors" field from dc:creator elements.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single OAI DC record in XML.
+
+        Returns:
+            One Contributor of kind "Creator" per dc:creator element matched with
+            string=True, or None when no element matches.
+        """
+        creator_elements = source_record.find_all("dc:creator", string=True)
+        contributors = [
+            timdex.Contributor(value=str(element.string), kind="Creator")
+            for element in creator_elements
+        ]
+        return contributors if contributors else None
+
+ @classmethod
+ def get_dates(cls, source_record: Tag) -> list[timdex.Date] | None:
"""
Method to get TIMDEX "dates" field. This method broken out to allow subclasses
to override.
@@ -124,45 +108,86 @@ def get_dates(self, source_record_id: str, xml: Tag) -> list[timdex.Date]:
Return list of timdex.Date's if valid and present.
Args:
- source_record_id: Source record id
- xml: A BeautifulSoup Tag representing a single OAI DC XML record.
+ source_record: A BeautifulSoup Tag representing a single OAI DC record in XML.
+
"""
dates = []
- if date_elements := xml.find_all("dc:date", string=True):
- for date in date_elements:
- date_str = str(date.string.strip())
- if validate_date(
- date_str,
- source_record_id,
- ):
- dates.append(timdex.Date(value=date_str, kind="Unknown"))
- return dates
-
- def get_links(self, _source_record_id: str, _xml: Tag) -> list[timdex.Link] | None:
+ source_record_id = cls.get_source_record_id(source_record)
+ for date in source_record.find_all("dc:date", string=True):
+ date_value = str(date.string.strip())
+ if validate_date(date_value, source_record_id):
+ dates.append(timdex.Date(value=date_value, kind="Unknown"))
+ return dates or None
+
+ @classmethod
+ def get_format(cls) -> str:
+ """Return the TIMDEX "format" field, which is always "electronic resource"."""
+ return "electronic resource"
+
+    @classmethod
+    def get_identifiers(cls, source_record: Tag) -> list[timdex.Identifier] | None:
+        """
+        Build the TIMDEX "identifiers" field from the OAI-PMH header identifier.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single OAI DC record in XML.
+
+        Returns:
+            A one-item list holding an Identifier of kind "OAI-PMH", or None when the
+            record has no header element or no header identifier matched with
+            string=True.
+        """
+        # Attribute-style access returns None when the element is absent; guard it so
+        # a header-less record yields None instead of raising AttributeError.
+        header = source_record.header
+        if header is None:
+            return None
+        if identifier := header.find("identifier", string=True):
+            return [timdex.Identifier(value=str(identifier.string), kind="OAI-PMH")]
+        return None
+
+ def get_links(
+ self,
+ _source_record: Tag,
+ ) -> list[timdex.Link] | None:
"""
Method to get TIMDEX "links" field. This method broken out to allow subclasses
to override.
Args:
- source_record_id: Source record id
- xml: A BeautifulSoup Tag representing a single OAI DC XML record.
+ _source_record: A BeautifulSoup Tag representing a single OAI DC record in XML.
+ Unused by this default implementation, which always returns None.
"""
return None
@classmethod
- def get_main_titles(cls, xml: Tag) -> list[str]:
+    def get_publishers(cls, source_record: Tag) -> list[timdex.Publisher] | None:
+        """
+        Build the TIMDEX "publishers" field from dc:publisher elements.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single OAI DC record in XML.
+
+        Returns:
+            One Publisher per dc:publisher element matched with string=True, or None
+            when no element matches.
+        """
+        publisher_elements = source_record.find_all("dc:publisher", string=True)
+        publishers = [
+            timdex.Publisher(name=str(element.string)) for element in publisher_elements
+        ]
+        return publishers if publishers else None
+
+    @classmethod
+    def get_subjects(cls, source_record: Tag) -> list[timdex.Subject] | None:
+        """
+        Build the TIMDEX "subjects" field from dc:subject elements.
+
+        All dc:subject values are grouped into a single Subject, because the record
+        does not say which subject scheme they belong to.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single OAI DC record in XML.
+
+        Returns:
+            A one-item list holding a Subject of kind "Subject scheme not provided"
+            whose value lists every dc:subject matched with string=True, or None when
+            no element matches.
+        """
+        subjects = [
+            str(subject.string)
+            for subject in source_record.find_all("dc:subject", string=True)
+        ]
+        if not subjects:
+            return None
+        return [timdex.Subject(value=subjects, kind="Subject scheme not provided")]
+
+    @classmethod
+    def get_summary(cls, source_record: Tag) -> list[str] | None:
+        """
+        Build the TIMDEX "summary" field from dc:description elements.
+
+        Args:
+            source_record: A BeautifulSoup Tag representing a single OAI DC record in XML.
+
+        Returns:
+            The string of every dc:description element matched with string=True, or
+            None when no element matches.
+        """
+        descriptions = source_record.find_all("dc:description", string=True)
+        if not descriptions:
+            return None
+        return [str(description.string) for description in descriptions]
+
+ @classmethod
+ def get_main_titles(cls, source_record: Tag) -> list[str]:
"""
Retrieve main title(s) from a generic OAI DC XML record.
Overrides metaclass get_main_titles() method.
Args:
- xml: A BeautifulSoup Tag representing a single OAI DC XML record.
+ source_record: A BeautifulSoup Tag representing a single OAI DC record in XML.
"""
- return [t.string for t in xml.find_all("dc:title", string=True)]
+ return [
+ str(title.string) for title in source_record.find_all("dc:title", string=True)
+ ]
@classmethod
- def get_source_record_id(cls, xml: Tag) -> str:
+ def get_source_record_id(cls, source_record: Tag) -> str:
"""
Use OAI-PMH header identifier. It is anticipated this will likely need to get
overridden by subclasses with a meaningful identifier.
@@ -170,6 +195,12 @@ def get_source_record_id(cls, xml: Tag) -> str:
Overrides metaclass get_source_record_id() method.
Args:
- xml: A BeautifulSoup Tag representing a single OAI DC XML record.
+ source_record: A BeautifulSoup Tag representing a single OAI DC record in XML.
"""
- return xml.header.identifier.string.split(":")[-1]
+ if identifier := source_record.header.find("identifier", string=True):
+ return str(identifier.string).split(":")[-1]
+ message = (
+ "Record skipped because 'source_record_id' could not be derived. "
+ "The 'identifier' was either missing from the header element or blank."
+ )
+ raise SkippedRecordEvent(message)
diff --git a/transmogrifier/sources/xml/springshare.py b/transmogrifier/sources/xml/springshare.py
index b313987..981661b 100644
--- a/transmogrifier/sources/xml/springshare.py
+++ b/transmogrifier/sources/xml/springshare.py
@@ -5,6 +5,7 @@
from dateutil.parser import parse as date_parser
import transmogrifier.models as timdex
+from transmogrifier.exceptions import SkippedRecordEvent
from transmogrifier.helpers import validate_date
from transmogrifier.sources.xml.oaidc import OaiDc
@@ -20,7 +21,8 @@ class SpringshareOaiDc(OaiDc):
- researchdatabases
"""
- def get_dates(self, source_record_id: str, xml: Tag) -> list[timdex.Date]:
+ @classmethod
+ def get_dates(cls, source_record: Tag) -> list[timdex.Date] | None:
"""
Overrides OaiDc's default get_dates() logic for Springshare records.
@@ -31,11 +33,11 @@ def get_dates(self, source_record_id: str, xml: Tag) -> list[timdex.Date]:
Additionally, only a single date will is expected.
Args:
- source_record_id: Source record id
- xml: A BeautifulSoup Tag representing a single OAI DC XML record.
+ source_record: A BeautifulSoup Tag representing a single OAI DC record in XML.
"""
dates = []
- if date := xml.find("dc:date", string=True):
+ source_record_id = cls.get_source_record_id(source_record)
+ if date := source_record.find("dc:date", string=True):
try:
date_iso_str = date_parser(str(date.string).strip()).isoformat()
if validate_date(
@@ -49,18 +51,18 @@ def get_dates(self, source_record_id: str, xml: Tag) -> list[timdex.Date]:
source_record_id,
str(e),
)
- return dates
+ return dates or None
- def get_links(self, source_record_id: str, xml: Tag) -> list[timdex.Link] | None:
+ def get_links(self, source_record: Tag) -> list[timdex.Link] | None:
"""
Overrides OaiDc's default get_links() logic for Springshare records.
Args:
- source_record_id: Source record id
- xml: A BeautifulSoup Tag representing a single OAI DC XML record.
+ source_record: A BeautifulSoup Tag representing a single OAI DC record in XML.
"""
links = []
- if identifier := xml.find("dc:identifier", string=True):
+ source_record_id = self.get_source_record_id(source_record)
+ if identifier := source_record.find("dc:identifier", string=True):
singular_source_name = self.source_name.rstrip("s")
links.append(
timdex.Link(
@@ -69,15 +71,19 @@ def get_links(self, source_record_id: str, xml: Tag) -> list[timdex.Link] | None
url=str(identifier.string),
)
)
+
logger.debug(
"Record ID %s has links that cannot be generated: missing dc:identifier",
source_record_id,
)
- return links
+ return links or None
@classmethod
def get_source_link(
- cls, _source_base_url: str, _source_record_id: str, xml: Tag
+ cls,
+ _source_base_url: str,
+ _source_record_id: str,
+ source_record: Tag,
) -> str:
"""
Override for default source_link behavior.
@@ -99,8 +105,12 @@ def get_source_link(
link.
Args:
- source_base_url: Source base URL.
- source_record_id: Record identifier for the source record.
- xml: A BeautifulSoup Tag representing a single XML record.
+ source_record: A BeautifulSoup Tag representing a single OAI DC record in XML.
"""
- return str(xml.find("dc:identifier").string)
+ if source_link := source_record.find("dc:identifier", string=True):
+ return str(source_link.string)
+ message = (
+ "Record skipped because 'source_link' could not be derived. "
+ "The 'identifier' was either missing from the header element or blank."
+ )
+ raise SkippedRecordEvent(message)