diff --git a/.gitignore b/.gitignore index 38b13f4..8e8d033 100644 --- a/.gitignore +++ b/.gitignore @@ -87,6 +87,9 @@ ipython_config.py # intended to run in multiple environments; otherwise, check them in: # .python-version +# asdf +.tool-versions + # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies diff --git a/tests/conftest.py b/tests/conftest.py index 2785419..0c93b89 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -89,17 +89,11 @@ def marc_content_type_crosswalk(): return load_external_config("config/marc_content_type_crosswalk.json", "json") -# oaidc ########################## - - @pytest.fixture def oai_pmh_records(): return XMLTransformer.parse_source_file("tests/fixtures/oai_pmh_records.xml") -# timdex ########################## - - @pytest.fixture def timdex_record_required_fields(): return timdex.TimdexRecord( diff --git a/tests/fixtures/oai_dc/oaidc_record_valid_generic_date.xml b/tests/fixtures/oai_dc/oaidc_record_valid_generic_date.xml deleted file mode 100644 index a378095..0000000 --- a/tests/fixtures/oai_dc/oaidc_record_valid_generic_date.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - -
- oai:libguides.com:guides/175846 - 2023-05-31T19:49:21Z - guides -
- - - Materials Science & Engineering - Ye Li - Engineering - Science - Useful databases and other research tips for materials science. - MIT Libraries - 2008-06-19T17:55:27 - https://libguides.mit.edu/materials - - -
-
\ No newline at end of file diff --git a/tests/sources/xml/test_oai_dc.py b/tests/sources/xml/test_oai_dc.py index 2b98737..f7ce89e 100644 --- a/tests/sources/xml/test_oai_dc.py +++ b/tests/sources/xml/test_oai_dc.py @@ -1,36 +1,48 @@ +import pytest +from bs4 import BeautifulSoup + import transmogrifier.models as timdex +from transmogrifier.exceptions import SkippedRecordEvent from transmogrifier.sources.xml.oaidc import OaiDc -FIXTURES_PREFIX = "tests/fixtures/oai_dc" - -BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord( - source="LibGuides", - source_link="https://libguides.mit.edu/guides/175846", - timdex_record_id="libguides:guides-175846", - title="Materials Science & Engineering", - citation="Materials Science & Engineering. libguides. " - "https://libguides.mit.edu/guides/175846", - content_type=["libguides"], - format="electronic resource", - identifiers=[ - timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH") - ], -) +def create_oaidc_source_record_stub( + header_insert: str = "", metadata_insert: str = "" +) -> BeautifulSoup: + + xml_str = f""" + + +
+ {header_insert} +
+ + + {metadata_insert} + + +
+ Ye Li + """ + ) + ) + assert OaiDc.get_contributors(source_record) == [ + timdex.Contributor( + value="Ye Li", + kind="Creator", + ) + ] + + +def test_get_contributors_transforms_correctly_if_fields_blank(): + source_record = create_oaidc_source_record_stub( + metadata_insert=( + """ + + """ + ) + ) + assert OaiDc.get_contributors(source_record) is None + + +def test_get_contributors_transforms_correctly_if_fields_missing(): + source_record = create_oaidc_source_record_stub() + assert OaiDc.get_contributors(source_record) is None + + +def test_get_dates_success(): + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + oai:libguides.com:guides/175846 + """ + ), + metadata_insert=( + """ + 2008-06-19T17:55:27 + """ + ), + ) + assert OaiDc.get_dates(source_record) == [ + timdex.Date(kind="Unknown", value="2008-06-19T17:55:27") + ] + + +def test_get_dates_transforms_correctly_if_fields_blank(): + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + oai:libguides.com:guides/175846 + """ + ), + metadata_insert=( + """ + + """ + ), + ) + assert OaiDc.get_dates(source_record) is None + + +def test_get_dates_transforms_correctly_if_fields_missing(): + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + oai:libguides.com:guides/175846 + """ + ) + ) + assert OaiDc.get_dates(source_record) is None + + +def test_get_dates_transforms_correctly_if_date_invalid(): + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + oai:libguides.com:guides/175846 + """ + ), + metadata_insert=( + """ + INVALID + """ + ), + ) + assert OaiDc.get_dates(source_record) is None + + +def test_get_identifiers_success(): + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + oai:libguides.com:guides/175846 + """ + ) ) - transformer_instance = OaiDc("libguides", source_records) - xml = next(transformer_instance.source_records) - assert transformer_instance.get_dates("test_source_record_id", xml) == [ - timdex.Date(kind="Unknown", note=None, range=None, value="2008-06-19T17:55:27") + assert OaiDc.get_identifiers(source_record) == [ + timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH") ] + + +def test_get_identifiers_transforms_correctly_if_fields_blank(): + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + + """ + ) + ) + assert OaiDc.get_identifiers(source_record) is None + + +def test_get_identifiers_transforms_correctly_if_fields_missing(): + source_record = create_oaidc_source_record_stub() + assert OaiDc.get_identifiers(source_record) is None + + +def test_get_publishers_success(): + source_record = create_oaidc_source_record_stub( + metadata_insert=( + """ + MIT Libraries + """ + ) + ) + assert OaiDc.get_publishers(source_record) == [timdex.Publisher(name="MIT Libraries")] + + +def test_get_publishers_transforms_correctly_if_fields_blank(): + source_record = create_oaidc_source_record_stub( + metadata_insert=( + """ + + """ + ) + ) + assert OaiDc.get_publishers(source_record) is None + + +def test_get_publishers_transforms_correctly_if_fields_missing(): + source_record = create_oaidc_source_record_stub() + assert OaiDc.get_publishers(source_record) is None + + +def test_get_subjects_success(): + source_record = create_oaidc_source_record_stub( + metadata_insert=( + """ + Engineering + Science + """ + ) + ) + assert OaiDc.get_subjects(source_record) == [ + timdex.Subject( + value=["Engineering", "Science"], kind="Subject scheme not provided" + ) + ] + + +def test_get_subjects_transforms_correctly_if_fields_blank(): + source_record = create_oaidc_source_record_stub( + metadata_insert=( + """ + + """ + ) + ) + assert OaiDc.get_subjects(source_record) is None + + +def test_get_subjects_transforms_correctly_if_fields_missing(): + source_record = create_oaidc_source_record_stub() + assert OaiDc.get_subjects(source_record) is None + + +def test_get_summary_success(): + source_record = create_oaidc_source_record_stub( + metadata_insert=( + "" + "Useful databases and other research tips for materials science." + "" + ) + ) + assert OaiDc.get_summary(source_record) == [ + "Useful databases and other research tips for materials science." + ] + + +def test_get_summary_transforms_properly_if_fields_blank(): + source_record = create_oaidc_source_record_stub( + metadata_insert=( + """ + + """ + ) + ) + assert OaiDc.get_summary(source_record) is None + + +def test_get_summary_transforms_properly_if_fields_missing(): + source_record = create_oaidc_source_record_stub() + assert OaiDc.get_summary(source_record) is None + + +def test_get_main_titles_success(): + source_record = create_oaidc_source_record_stub( + metadata_insert=( + """ + Materials Science & Engineering + """ + ) + ) + assert OaiDc.get_main_titles(source_record) == ["Materials Science & Engineering"] + + +def test_get_main_titles_transforms_properly_if_fields_blank(): + source_record = create_oaidc_source_record_stub( + metadata_insert=( + """ + + """ + ) + ) + assert OaiDc.get_main_titles(source_record) == [] + + +def test_get_main_titles_transforms_properly_if_fields_missing(): + source_record = create_oaidc_source_record_stub() + assert OaiDc.get_main_titles(source_record) == [] + + +def test_get_source_record_id_success(): + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + oai:libguides.com:guides/175846 + """ + ) + ) + assert OaiDc.get_source_record_id(source_record) == "guides/175846" + + +def test_get_source_record_id_raises_skipped_record_event_if_fields_blank(): + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + + """ + ) + ) + with pytest.raises( + SkippedRecordEvent, + match=( + "Record skipped because 'source_record_id' could not be derived. " + "The 'identifier' was either missing from the header element or blank." + ), + ): + OaiDc.get_source_record_id(source_record) + + +def test_get_source_record_id_raises_skipped_record_event_if_fields_missing(): + source_record = create_oaidc_source_record_stub() + with pytest.raises( + SkippedRecordEvent, + match=( + "Record skipped because 'source_record_id' could not be derived. " + "The 'identifier' was either missing from the header element or blank." + ), + ): + OaiDc.get_source_record_id(source_record) diff --git a/tests/sources/xml/test_springshare.py b/tests/sources/xml/test_springshare.py index f02c96f..56973d6 100644 --- a/tests/sources/xml/test_springshare.py +++ b/tests/sources/xml/test_springshare.py @@ -1,97 +1,222 @@ +# ruff: noqa: E501 import logging +import pytest +from bs4 import BeautifulSoup + import transmogrifier.models as timdex +from transmogrifier.exceptions import SkippedRecordEvent from transmogrifier.sources.xml.springshare import SpringshareOaiDc SPRINGSHARE_FIXTURES_PREFIX = "tests/fixtures/oai_dc/springshare" - LIBGUIDES_FIXTURES_PREFIX = f"{SPRINGSHARE_FIXTURES_PREFIX}/libguides" - RESEARCHDATABASES_FIXTURES_PREFIX = f"{SPRINGSHARE_FIXTURES_PREFIX}/research_databases" -LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord( - source="LibGuides", - source_link="https://libguides.mit.edu/materials", - timdex_record_id="libguides:guides-175846", - title="Materials Science & Engineering", - citation="Materials Science & Engineering. libguides. " - "https://libguides.mit.edu/materials", - content_type=["libguides"], - format="electronic resource", - identifiers=[ - timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH") - ], - links=[ +def create_oaidc_source_record_stub( + header_insert: str = "", metadata_insert: str = "" +) -> BeautifulSoup: + + xml_str = f""" + + +
+ {header_insert} +
+ + + {metadata_insert} + + +
+ oai:libguides.com:guides/175846 + """ + ), + metadata_insert=( + """ + January 1st, 2000 + """ + ), + ) + assert SpringshareOaiDc.get_dates(source_record) == [ + timdex.Date(kind="Created", value="2000-01-01T00:00:00") + ] + + +def test_get_dates_transforms_correctly_if_optional_fields_blank(): + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + oai:libguides.com:guides/175846 + """ + ), + metadata_insert=( + """ + + """ + ), + ) + assert SpringshareOaiDc.get_dates(source_record) is None + + +def test_get_dates_transforms_correctly_if_optional_fields_missing(): + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + oai:libguides.com:guides/175846 + """ + ) + ) + assert SpringshareOaiDc.get_dates(source_record) is None + + +def test_get_dates_transforms_correctly_and_logs_error_if_date_invalid( + caplog, +): + caplog.set_level(logging.DEBUG) + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + oai:libguides.com:guides/175846 + """ + ), + metadata_insert=( + """ + INVALID + """ + ), + ) + assert SpringshareOaiDc.get_dates(source_record) is None + assert ( + "Record ID guides/175846 has a date that cannot be parsed: Unknown string format: INVALID" + in caplog.text + ) + + +def test_get_links_success(): + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + oai:libguides.com:guides/175846 + """ + ), + metadata_insert=( + """ + https://libguides.mit.edu/materials + """ + ), + ) + assert SpringshareOaiDc("libguides", iter(())).get_links( + source_record=source_record + ) == [ timdex.Link( - url="https://libguides.mit.edu/materials", kind="LibGuide URL", text="LibGuide URL", + url="https://libguides.mit.edu/materials", ) - ], -) - -RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX = timdex.TimdexRecord( - source="Research Databases", - source_link="https://libguides.mit.edu/llba", - timdex_record_id="researchdatabases:az-65257807", - title="Linguistics and Language Behavior Abstracts (LLBA)", - citation="Linguistics and Language Behavior Abstracts (LLBA). researchdatabases. " - "https://libguides.mit.edu/llba", - content_type=["researchdatabases"], - format="electronic resource", - identifiers=[ - timdex.Identifier(value="oai:libguides.com:az/65257807", kind="OAI-PMH") - ], - links=[ - timdex.Link( - url="https://libguides.mit.edu/llba", - kind="Research Database URL", - text="Research Database URL", + ] + + +def test_get_links_transforms_correctly_if_required_fields_blank(): + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + oai:libguides.com:guides/175846 + """ + ), + metadata_insert=( + """ + + """ + ), + ) + assert SpringshareOaiDc("libguides", iter(())).get_links(source_record) is None + + +def test_get_links_transforms_correctly_if_required_fields_missing(): + source_record = create_oaidc_source_record_stub( + header_insert=( + """ + oai:libguides.com:guides/175846 + """ ) - ], -) + ) + assert SpringshareOaiDc("libguides", iter(())).get_links(source_record) is None -def test_springshare_get_dates_valid(): - source_records = SpringshareOaiDc.parse_source_file( - f"{SPRINGSHARE_FIXTURES_PREFIX}/springshare_valid_dates.xml" - ) - transformer_instance = SpringshareOaiDc("libguides", source_records) - for xml in transformer_instance.source_records: - date_field_value = transformer_instance.get_dates("test_get_dates", xml) - assert date_field_value == [ - timdex.Date( - kind="Created", note=None, range=None, value="2000-01-01T00:00:00" - ) - ] +def test_get_source_link_success(): + source_record = create_oaidc_source_record_stub( + metadata_insert=( + """ + https://libguides.mit.edu/materials + """ + ) + ) + assert ( + SpringshareOaiDc.get_source_link("", "", source_record) + == "https://libguides.mit.edu/materials" + ) -def test_springshare_get_dates_invalid_logged_and_skipped(caplog): - caplog.set_level(logging.DEBUG) - source_records = SpringshareOaiDc.parse_source_file( - f"{SPRINGSHARE_FIXTURES_PREFIX}/springshare_invalid_dates.xml" +def test_get_source_link_raises_skipped_record_event_if_required_fields_blank(): + source_record = create_oaidc_source_record_stub( + metadata_insert=( + """ + + """ + ) ) - transformer_instance = SpringshareOaiDc("libguides", source_records) - for xml in transformer_instance.source_records: - date_field_value = transformer_instance.get_dates("test_get_dates", xml) - assert date_field_value == [] - assert "has a date that cannot be parsed" in caplog.text + with pytest.raises( + SkippedRecordEvent, + match=( + "Record skipped because 'source_link' could not be derived. " + "The 'identifier' was either missing from the header element or blank." + ), + ): + SpringshareOaiDc.get_source_link("", "", source_record) -def test_springshare_get_links_missing_identifier_logged_and_skipped(caplog): - caplog.set_level(logging.DEBUG) - source_records = SpringshareOaiDc.parse_source_file( - f"{SPRINGSHARE_FIXTURES_PREFIX}/springshare_record_missing_required_fields.xml" +def test_get_source_link_raises_skipped_record_event_if_required_fields_missing(): + source_record = create_oaidc_source_record_stub( + metadata_insert=( + """ + + """ + ) ) - transformer_instance = SpringshareOaiDc("libguides", source_records) - for xml in transformer_instance.source_records: - links_field_value = transformer_instance.get_links("test_get_links", xml) - assert links_field_value == [] - assert "has links that cannot be generated" in caplog.text + with pytest.raises( + SkippedRecordEvent, + match=( + "Record skipped because 'source_link' could not be derived. " + "The 'identifier' was either missing from the header element or blank." + ), + ): + SpringshareOaiDc.get_source_link("", "", source_record) + +########################### +# Springshare - LibGuides +########################### -def test_libguide_transform_with_all_fields_transforms_correctly(): + +def test_springshare_libguides_transform_with_all_fields_transforms_correctly(): source_records = SpringshareOaiDc.parse_source_file( f"{LIBGUIDES_FIXTURES_PREFIX}/libguides_record_all_fields.xml" ) @@ -135,23 +260,66 @@ def test_libguide_transform_with_all_fields_transforms_correctly(): ) -def test_libguides_transform_with_optional_fields_blank_transforms_correctly(): +def test_springshare_libguides_transform_with_optional_fields_blank_transforms_correctly(): source_records = SpringshareOaiDc.parse_source_file( f"{LIBGUIDES_FIXTURES_PREFIX}/libguides_record_optional_fields_blank.xml" ) output_records = SpringshareOaiDc("libguides", source_records) - assert next(output_records) == LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + assert next(output_records) == timdex.TimdexRecord( + source="LibGuides", + source_link="https://libguides.mit.edu/materials", + timdex_record_id="libguides:guides-175846", + title="Materials Science & Engineering", + citation="Materials Science & Engineering. libguides. " + "https://libguides.mit.edu/materials", + content_type=["libguides"], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH") + ], + links=[ + timdex.Link( + url="https://libguides.mit.edu/materials", + kind="LibGuide URL", + text="LibGuide URL", + ) + ], + ) -def test_libguides_transform_with_optional_fields_missing_transforms_correctly(): +def test_springshare_libguides_transform_with_optional_fields_missing_transforms_correctly(): source_records = SpringshareOaiDc.parse_source_file( f"{LIBGUIDES_FIXTURES_PREFIX}/libguides_record_optional_fields_missing.xml" ) output_records = SpringshareOaiDc("libguides", source_records) - assert next(output_records) == LIBGUIDES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + assert next(output_records) == timdex.TimdexRecord( + source="LibGuides", + source_link="https://libguides.mit.edu/materials", + timdex_record_id="libguides:guides-175846", + title="Materials Science & Engineering", + citation="Materials Science & Engineering. libguides. " + "https://libguides.mit.edu/materials", + content_type=["libguides"], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:guides/175846", kind="OAI-PMH") + ], + links=[ + timdex.Link( + url="https://libguides.mit.edu/materials", + kind="LibGuide URL", + text="LibGuide URL", + ) + ], + ) -def test_research_databases_transform_with_all_fields_transforms_correctly(): +#################################### +# Springshare - Research Databases +#################################### + + +def test_springshare_research_databases_transform_with_all_fields_transforms_correctly(): source_records = SpringshareOaiDc.parse_source_file( f"{RESEARCHDATABASES_FIXTURES_PREFIX}/research_databases_record_all_fields.xml" ) @@ -191,23 +359,57 @@ def test_research_databases_transform_with_all_fields_transforms_correctly(): ) -def test_research_databases_transform_with_optional_fields_blank_transforms_correctly(): +def test_springshare_research_databases_transform_with_optional_fields_blank_transforms_correctly(): source_records = SpringshareOaiDc.parse_source_file( RESEARCHDATABASES_FIXTURES_PREFIX + "/research_databases_record_optional_fields_blank.xml" ) output_records = SpringshareOaiDc("researchdatabases", source_records) - assert ( - next(output_records) == RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + assert next(output_records) == timdex.TimdexRecord( + source="Research Databases", + source_link="https://libguides.mit.edu/llba", + timdex_record_id="researchdatabases:az-65257807", + title="Linguistics and Language Behavior Abstracts (LLBA)", + citation="Linguistics and Language Behavior Abstracts (LLBA). researchdatabases. " + "https://libguides.mit.edu/llba", + content_type=["researchdatabases"], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:az/65257807", kind="OAI-PMH") + ], + links=[ + timdex.Link( + url="https://libguides.mit.edu/llba", + kind="Research Database URL", + text="Research Database URL", + ) + ], ) -def test_research_databases_transform_with_optional_fields_missing_transforms_correctly(): +def test_springshare_research_databases_transform_with_optional_fields_missing_transforms_correctly(): source_records = SpringshareOaiDc.parse_source_file( RESEARCHDATABASES_FIXTURES_PREFIX + "/research_databases_record_optional_fields_missing.xml" ) output_records = SpringshareOaiDc("researchdatabases", source_records) - assert ( - next(output_records) == RESEARCHDATABASES_BLANK_OR_MISSING_OPTIONAL_FIELDS_TIMDEX + assert next(output_records) == timdex.TimdexRecord( + source="Research Databases", + source_link="https://libguides.mit.edu/llba", + timdex_record_id="researchdatabases:az-65257807", + title="Linguistics and Language Behavior Abstracts (LLBA)", + citation="Linguistics and Language Behavior Abstracts (LLBA). researchdatabases. " + "https://libguides.mit.edu/llba", + content_type=["researchdatabases"], + format="electronic resource", + identifiers=[ + timdex.Identifier(value="oai:libguides.com:az/65257807", kind="OAI-PMH") + ], + links=[ + timdex.Link( + url="https://libguides.mit.edu/llba", + kind="Research Database URL", + text="Research Database URL", + ) + ], ) diff --git a/transmogrifier/sources/xml/oaidc.py b/transmogrifier/sources/xml/oaidc.py index 08c2ae3..952eb8f 100644 --- a/transmogrifier/sources/xml/oaidc.py +++ b/transmogrifier/sources/xml/oaidc.py @@ -3,6 +3,7 @@ from bs4 import Tag # type: ignore[import-untyped] import transmogrifier.models as timdex +from transmogrifier.exceptions import SkippedRecordEvent from transmogrifier.helpers import validate_date from transmogrifier.sources.xmltransformer import XMLTransformer @@ -17,18 +18,15 @@ class OaiDc(XMLTransformer): anticipated this will most likely get extended by a source-specific transformer. """ - def get_optional_fields(self, xml: Tag) -> dict | None: + def get_optional_fields(self, source_record: Tag) -> dict | None: """ Retrieve optional TIMDEX fields from a generic OAI DC XML record. Args: - xml: A BeautifulSoup Tag representing a single OAI DC XML record + source_record: A BeautifulSoup Tag representing a single OAI DC record in XML. """ fields: dict = {} - # extract source_record_id early for use and logging - source_record_id = self.get_source_record_id(xml) - # alternate_titles: not set in this transformation # call_numbers: not set in this transformation @@ -36,45 +34,34 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # citation: uses fallback get_citation() method # content_type - fields["content_type"] = [self.source] + fields["content_type"] = self.get_content_type() # contents: not set in this transformation # contributors - for creator in [c for c in xml.find_all("dc:creator") if c.string]: - fields.setdefault("contributors", []).append( - timdex.Contributor( - value=str(creator.string), - kind="Creator", - ) - ) + fields["contributors"] = self.get_contributors(source_record) # dates - fields["dates"] = self.get_dates(source_record_id, xml) or None + fields["dates"] = self.get_dates(source_record) # edition: not set in this transformation # file_formats: not set in this transformation # format - fields["format"] = "electronic resource" + fields["format"] = self.get_format() # funding_information: not set in this transformation # holdings: not set in this transformation # identifiers - fields.setdefault("identifiers", []).append( - timdex.Identifier( - value=str(xml.header.identifier.string), - kind="OAI-PMH", - ) - ) + fields["identifiers"] = self.get_identifiers(source_record) # languages: not set in this transformation # links - fields["links"] = self.get_links(source_record_id, xml) or None + fields["links"] = self.get_links(source_record) # literary_form: not set in this transformation @@ -89,34 +76,31 @@ def get_optional_fields(self, xml: Tag) -> dict | None: # publication_frequency: not set in this transformation # publishers - fields["publishers"] = [ - timdex.Publisher(name=str(p.string)) - for p in xml.find_all("dc:publisher") - if p.string - ] or None + fields["publishers"] = self.get_publishers(source_record) # related_items: not set in this transformation # rights: not set in this transformation # subjects - subjects_dict: dict[str, list[str]] = {} - for subject in xml.metadata.find_all("dc:subject", string=True): - subjects_dict.setdefault("Subject scheme not provided", []).append( - str(subject.string) - ) - fields["subjects"] = [ - timdex.Subject(value=value, kind=key) for key, value in subjects_dict.items() - ] or None + fields["subjects"] = self.get_subjects(source_record) # summary - # uses description list retrieved for notes field - for description in [d for d in xml.find_all("dc:description") if d.string]: - fields.setdefault("summary", []).append(str(description.string)) - + fields["summary"] = self.get_summary(source_record) return fields - def get_dates(self, source_record_id: str, xml: Tag) -> list[timdex.Date]: + def get_content_type(self) -> list[str]: + return [self.source] + + @classmethod + def get_contributors(cls, source_record: Tag) -> list[timdex.Contributor] | None: + return [ + timdex.Contributor(value=str(creator.string), kind="Creator") + for creator in source_record.find_all("dc:creator", string=True) + ] or None + + @classmethod + def get_dates(cls, source_record: Tag) -> list[timdex.Date] | None: """ Method to get TIMDEX "dates" field. This method broken out to allow subclasses to override. @@ -124,45 +108,86 @@ def get_dates(self, source_record_id: str, xml: Tag) -> list[timdex.Date]: Return list of timdex.Date's if valid and present. Args: - source_record_id: Source record id - xml: A BeautifulSoup Tag representing a single OAI DC XML record. + source_record: A BeautifulSoup Tag representing a single OAI DC record in XML. + """ dates = [] - if date_elements := xml.find_all("dc:date", string=True): - for date in date_elements: - date_str = str(date.string.strip()) - if validate_date( - date_str, - source_record_id, - ): - dates.append(timdex.Date(value=date_str, kind="Unknown")) - return dates - - def get_links(self, _source_record_id: str, _xml: Tag) -> list[timdex.Link] | None: + source_record_id = cls.get_source_record_id(source_record) + for date in source_record.find_all("dc:date", string=True): + date_value = str(date.string.strip()) + if validate_date(date_value, source_record_id): + dates.append(timdex.Date(value=date_value, kind="Unknown")) + return dates or None + + @classmethod + def get_format(cls) -> str: + return "electronic resource" + + @classmethod + def get_identifiers(cls, source_record: Tag) -> list[timdex.Identifier] | None: + identifiers = [] + if identifier := source_record.header.find("identifier", string=True): + identifiers.append( + timdex.Identifier( + value=str(identifier.string), + kind="OAI-PMH", + ) + ) + return identifiers or None + + def get_links( + self, + _source_record: Tag, + ) -> list[timdex.Link] | None: """ Method to get TIMDEX "links" field. This method broken out to allow subclasses to override. Args: - source_record_id: Source record id - xml: A BeautifulSoup Tag representing a single OAI DC XML record. + source_record: A BeautifulSoup Tag representing a single OAI DC record in XML. """ - return None + return [] or None @classmethod - def get_main_titles(cls, xml: Tag) -> list[str]: + def get_publishers(cls, source_record: Tag) -> list[timdex.Publisher] | None: + return [ + timdex.Publisher(name=str(publisher.string)) + for publisher in source_record.find_all("dc:publisher", string=True) + ] or None + + @classmethod + def get_subjects(cls, source_record: Tag) -> list[timdex.Subject] | None: + subjects = [ + str(subject.string) + for subject in source_record.find_all("dc:subject", string=True) + ] + if subjects: + return [timdex.Subject(value=subjects, kind="Subject scheme not provided")] + return [] or None + + @classmethod + def get_summary(cls, source_record: Tag) -> list[str] | None: + return [ + str(description.string) + for description in source_record.find_all("dc:description", string=True) + ] or None + + @classmethod + def get_main_titles(cls, source_record: Tag) -> list[str]: """ Retrieve main title(s) from a generic OAI DC XML record. Overrides metaclass get_main_titles() method. Args: - xml: A BeautifulSoup Tag representing a single OAI DC XML record. + source_record: A BeautifulSoup Tag representing a single OAI DC record in XML. """ - return [t.string for t in xml.find_all("dc:title", string=True)] + return [ + str(title.string) for title in source_record.find_all("dc:title", string=True) + ] @classmethod - def get_source_record_id(cls, xml: Tag) -> str: + def get_source_record_id(cls, source_record: Tag) -> str: """ Use OAI-PMH header identifier. It is anticipated this will likely need to get overridden by subclasses with a meaningful identifier. @@ -170,6 +195,12 @@ def get_source_record_id(cls, xml: Tag) -> str: Overrides metaclass get_source_record_id() method. Args: - xml: A BeautifulSoup Tag representing a single OAI DC XML record. + source_record: A BeautifulSoup Tag representing a single OAI DC record in XML. """ - return xml.header.identifier.string.split(":")[-1] + if identifier := source_record.header.find("identifier", string=True): + return str(identifier.string).split(":")[-1] + message = ( + "Record skipped because 'source_record_id' could not be derived. " + "The 'identifier' was either missing from the header element or blank." + ) + raise SkippedRecordEvent(message) diff --git a/transmogrifier/sources/xml/springshare.py b/transmogrifier/sources/xml/springshare.py index b313987..981661b 100644 --- a/transmogrifier/sources/xml/springshare.py +++ b/transmogrifier/sources/xml/springshare.py @@ -5,6 +5,7 @@ from dateutil.parser import parse as date_parser import transmogrifier.models as timdex +from transmogrifier.exceptions import SkippedRecordEvent from transmogrifier.helpers import validate_date from transmogrifier.sources.xml.oaidc import OaiDc @@ -20,7 +21,8 @@ class SpringshareOaiDc(OaiDc): - researchdatabases """ - def get_dates(self, source_record_id: str, xml: Tag) -> list[timdex.Date]: + @classmethod + def get_dates(cls, source_record: Tag) -> list[timdex.Date] | None: """ Overrides OaiDc's default get_dates() logic for Springshare records. @@ -31,11 +33,11 @@ def get_dates(self, source_record_id: str, xml: Tag) -> list[timdex.Date]: Additionally, only a single date will is expected. Args: - source_record_id: Source record id - xml: A BeautifulSoup Tag representing a single OAI DC XML record. + source_record: A BeautifulSoup Tag representing a single OAI DC record in XML. """ dates = [] - if date := xml.find("dc:date", string=True): + source_record_id = cls.get_source_record_id(source_record) + if date := source_record.find("dc:date", string=True): try: date_iso_str = date_parser(str(date.string).strip()).isoformat() if validate_date( @@ -49,18 +51,18 @@ def get_dates(self, source_record_id: str, xml: Tag) -> list[timdex.Date]: source_record_id, str(e), ) - return dates + return dates or None - def get_links(self, source_record_id: str, xml: Tag) -> list[timdex.Link] | None: + def get_links(self, source_record: Tag) -> list[timdex.Link] | None: """ Overrides OaiDc's default get_links() logic for Springshare records. Args: - source_record_id: Source record id - xml: A BeautifulSoup Tag representing a single OAI DC XML record. + source_record: A BeautifulSoup Tag representing a single OAI DC record in XML. """ links = [] - if identifier := xml.find("dc:identifier", string=True): + source_record_id = self.get_source_record_id(source_record) + if identifier := source_record.find("dc:identifier", string=True): singular_source_name = self.source_name.rstrip("s") links.append( timdex.Link( @@ -69,15 +71,19 @@ def get_links(self, source_record_id: str, xml: Tag) -> list[timdex.Link] | None url=str(identifier.string), ) ) + logger.debug( "Record ID %s has links that cannot be generated: missing dc:identifier", source_record_id, ) - return links + return links or None @classmethod def get_source_link( - cls, _source_base_url: str, _source_record_id: str, xml: Tag + cls, + _source_base_url: str, + _source_record_id: str, + source_record: Tag, ) -> str: """ Override for default source_link behavior. @@ -99,8 +105,12 @@ def get_source_link( link. Args: - source_base_url: Source base URL. - source_record_id: Record identifier for the source record. - xml: A BeautifulSoup Tag representing a single XML record. + source_record: A BeautifulSoup Tag representing a single OAI DC record in XML. """ - return str(xml.find("dc:identifier").string) + if source_link := source_record.find("dc:identifier", string=True): + return str(source_link.string) + message = ( + "Record skipped because 'source_link' could not be derived. " + "The 'identifier' was either missing from the header element or blank." + ) + raise SkippedRecordEvent(message)