From 43dba2112f6a5a7d62f367f32f8708592dc1feef Mon Sep 17 00:00:00 2001 From: Eric Hanson Date: Thu, 16 Nov 2023 11:27:58 -0500 Subject: [PATCH] Rename Transformer > XmlTransformer * Rename Transformer > XmlTransformer across repo to prepare for the creation of Transformer metaclass --- tests/test_transformer.py | 34 ++++++------- transmogrifier/helpers.py | 6 +-- transmogrifier/sources/datacite.py | 4 +- transmogrifier/sources/dspace_dim.py | 4 +- transmogrifier/sources/dspace_mets.py | 4 +- transmogrifier/sources/ead.py | 4 +- transmogrifier/sources/marc.py | 4 +- transmogrifier/sources/oaidc.py | 4 +- transmogrifier/sources/transformer.py | 73 +++++++++++++++------------ 9 files changed, 73 insertions(+), 64 deletions(-) diff --git a/tests/test_transformer.py b/tests/test_transformer.py index 57a4985..66ce00a 100644 --- a/tests/test_transformer.py +++ b/tests/test_transformer.py @@ -3,33 +3,33 @@ from transmogrifier.helpers import parse_xml_records from transmogrifier.models import TimdexRecord from transmogrifier.sources.datacite import Datacite -from transmogrifier.sources.transformer import Transformer +from transmogrifier.sources.transformer import XmlTransformer -def test_transformer_initializes_with_expected_attributes(oai_pmh_records): - transformer = Transformer("cool-repo", oai_pmh_records) +def test_xmltransformer_initializes_with_expected_attributes(oai_pmh_records): + transformer = XmlTransformer("cool-repo", oai_pmh_records) assert transformer.source == "cool-repo" assert transformer.source_base_url == "https://example.com/" assert transformer.source_name == "A Cool Repository" assert transformer.input_records == oai_pmh_records -def test_transformer_iterates_through_all_records(oai_pmh_records): - output_records = Transformer("cool-repo", oai_pmh_records) +def test_xmltransformer_iterates_through_all_records(oai_pmh_records): + output_records = XmlTransformer("cool-repo", oai_pmh_records) assert len(list(output_records)) == 2 assert output_records.processed_record_count == 3 assert output_records.transformed_record_count == 2 assert len(output_records.deleted_records) == 1 -def test_transformer_iterates_successfully_if_get_optional_fields_returns_none( +def test_xmltransformer_iterates_successfully_if_get_optional_fields_returns_none( oai_pmh_records, ): with patch( - "transmogrifier.sources.transformer.Transformer.get_optional_fields" + "transmogrifier.sources.transformer.XmlTransformer.get_optional_fields" ) as m: m.return_value = None - output_records = Transformer("cool-repo", oai_pmh_records) + output_records = XmlTransformer("cool-repo", oai_pmh_records) assert len(list(output_records)) == 0 assert output_records.processed_record_count == 3 assert output_records.skipped_record_count == 2 @@ -37,13 +37,13 @@ def test_transformer_iterates_successfully_if_get_optional_fields_returns_none( assert len(output_records.deleted_records) == 1 -def test_transformer_record_is_deleted_returns_true_if_deleted(caplog): +def test_xmltransformer_record_is_deleted_returns_true_if_deleted(caplog): input_records = parse_xml_records("tests/fixtures/record_deleted.xml") - assert Transformer.record_is_deleted(next(input_records)) is True + assert XmlTransformer.record_is_deleted(next(input_records)) is True -def test_transformer_get_required_fields_returns_expected_values(oai_pmh_records): - transformer = Transformer("cool-repo", oai_pmh_records) +def test_xmltransformer_get_required_fields_returns_expected_values(oai_pmh_records): + transformer = XmlTransformer("cool-repo", oai_pmh_records) assert transformer.get_required_fields(next(oai_pmh_records)) == { "source": "A Cool Repository", "source_link": "https://example.com/12345", @@ -52,8 +52,8 @@ def test_transformer_get_required_fields_returns_expected_values(oai_pmh_records } -def test_transformer_transform_returns_timdex_record(oai_pmh_records): - transformer = Transformer("cool-repo", oai_pmh_records) +def test_xmltransformer_transform_returns_timdex_record(oai_pmh_records): + transformer = XmlTransformer("cool-repo", oai_pmh_records) assert next(transformer) == TimdexRecord( source="A Cool Repository", source_link="https://example.com/12345", @@ -64,7 +64,7 @@ def test_transformer_transform_returns_timdex_record(oai_pmh_records): ) -def test_get_valid_title_with_title_field_blank_logs_warning(caplog): +def test_xmltransformer_get_valid_title_with_title_field_blank_logs_warning(caplog): input_records = parse_xml_records("tests/fixtures/record_title_field_blank.xml") output_records = Datacite("cool-repo", input_records) assert next(output_records).title == "Title not provided" @@ -74,7 +74,7 @@ def test_get_valid_title_with_title_field_blank_logs_warning(caplog): ) -def test_get_valid_title_with_title_field_missing_logs_warning(caplog): +def test_xmltransformer_get_valid_title_with_title_field_missing_logs_warning(caplog): input_records = parse_xml_records("tests/fixtures/record_title_field_missing.xml") output_records = Datacite("cool-repo", input_records) assert next(output_records).title == "Title not provided" @@ -84,7 +84,7 @@ def test_get_valid_title_with_title_field_missing_logs_warning(caplog): ) -def test_get_valid_title_with_title_field_multiple_logs_warning(caplog): +def test_xmltransformer_get_valid_title_with_title_field_multiple_logs_warning(caplog): input_records = parse_xml_records("tests/fixtures/record_title_field_multiple.xml") output_records = Datacite("cool-repo", input_records) assert ( diff --git a/transmogrifier/helpers.py b/transmogrifier/helpers.py index 7a4fdc3..a48c5b2 100644 --- a/transmogrifier/helpers.py +++ b/transmogrifier/helpers.py @@ -16,9 +16,9 @@ from transmogrifier.config import DATE_FORMATS from transmogrifier.models import TimdexRecord -# import Transformer only when type checking to avoid circular dependency +# import XmlTransformer only when type checking to avoid circular dependency if TYPE_CHECKING: # pragma: no cover - from transmogrifier.sources.transformer import Transformer + from transmogrifier.sources.transformer import XmlTransformer logger = logging.getLogger(__name__) @@ -177,7 +177,7 @@ def write_deleted_records_to_file(deleted_records: list[str], output_file_path: def write_timdex_records_to_json( - transformer_instance: "Transformer", output_file_path: str + transformer_instance: "XmlTransformer", output_file_path: str ) -> int: count = 0 try: diff --git a/transmogrifier/sources/datacite.py b/transmogrifier/sources/datacite.py index d7e2f94..1aee2aa 100644 --- a/transmogrifier/sources/datacite.py +++ b/transmogrifier/sources/datacite.py @@ -5,12 +5,12 @@ import transmogrifier.models as timdex from transmogrifier.helpers import validate_date, validate_date_range -from transmogrifier.sources.transformer import Transformer +from transmogrifier.sources.transformer import XmlTransformer logger = logging.getLogger(__name__) -class Datacite(Transformer): +class Datacite(XmlTransformer): """Datacite transformer.""" def get_optional_fields(self, xml: Tag) -> Optional[dict]: diff --git a/transmogrifier/sources/dspace_dim.py b/transmogrifier/sources/dspace_dim.py index 3c3582c..cee4229 100644 --- a/transmogrifier/sources/dspace_dim.py +++ b/transmogrifier/sources/dspace_dim.py @@ -5,12 +5,12 @@ import transmogrifier.models as timdex from transmogrifier.helpers import validate_date, validate_date_range -from transmogrifier.sources.transformer import Transformer +from transmogrifier.sources.transformer import XmlTransformer logger = logging.getLogger(__name__) -class DspaceDim(Transformer): +class DspaceDim(XmlTransformer): """DSpace DIM transformer.""" def get_optional_fields(self, xml: Tag) -> Optional[dict]: diff --git a/transmogrifier/sources/dspace_mets.py b/transmogrifier/sources/dspace_mets.py index ce4cb64..77c9cb8 100644 --- a/transmogrifier/sources/dspace_mets.py +++ b/transmogrifier/sources/dspace_mets.py @@ -5,12 +5,12 @@ import transmogrifier.models as timdex from transmogrifier.helpers import validate_date -from transmogrifier.sources.transformer import Transformer +from transmogrifier.sources.transformer import XmlTransformer logger = logging.getLogger(__name__) -class DspaceMets(Transformer): +class DspaceMets(XmlTransformer): """DSpace METS transformer.""" def get_optional_fields(self, xml: Tag) -> dict: diff --git a/transmogrifier/sources/ead.py b/transmogrifier/sources/ead.py index d82b1dc..82cdbad 100644 --- a/transmogrifier/sources/ead.py +++ b/transmogrifier/sources/ead.py @@ -6,7 +6,7 @@ import transmogrifier.models as timdex from transmogrifier.config import load_external_config from transmogrifier.helpers import validate_date, validate_date_range -from transmogrifier.sources.transformer import Transformer +from transmogrifier.sources.transformer import XmlTransformer logger = logging.getLogger(__name__) @@ -16,7 +16,7 @@ ) -class Ead(Transformer): +class Ead(XmlTransformer): """EAD transformer.""" def get_optional_fields(self, xml: Tag) -> Optional[dict]: diff --git a/transmogrifier/sources/marc.py b/transmogrifier/sources/marc.py index 95240bf..0077cc4 100644 --- a/transmogrifier/sources/marc.py +++ b/transmogrifier/sources/marc.py @@ -6,7 +6,7 @@ import transmogrifier.models as timdex from transmogrifier.config import load_external_config from transmogrifier.helpers import validate_date -from transmogrifier.sources.transformer import Transformer +from transmogrifier.sources.transformer import XmlTransformer logger = logging.getLogger(__name__) @@ -30,7 +30,7 @@ ) -class Marc(Transformer): +class Marc(XmlTransformer): """Marc transformer.""" def get_optional_fields(self, xml: Tag) -> Optional[dict]: diff --git a/transmogrifier/sources/oaidc.py b/transmogrifier/sources/oaidc.py index 6453e58..5645e17 100644 --- a/transmogrifier/sources/oaidc.py +++ b/transmogrifier/sources/oaidc.py @@ -5,12 +5,12 @@ import transmogrifier.models as timdex from transmogrifier.helpers import validate_date -from transmogrifier.sources.transformer import Transformer +from transmogrifier.sources.transformer import XmlTransformer logger = logging.getLogger(__name__) -class OaiDc(Transformer): +class OaiDc(XmlTransformer): """ Generic OAI DC transformer. diff --git a/transmogrifier/sources/transformer.py b/transmogrifier/sources/transformer.py index 4072a16..cc83798 100644 --- a/transmogrifier/sources/transformer.py +++ b/transmogrifier/sources/transformer.py @@ -12,11 +12,12 @@ logger = logging.getLogger(__name__) -class Transformer(object): +class XmlTransformer(object): """Base transformer class.""" __metaclass__ = ABCMeta + @final def __init__(self, source: str, input_records: Iterator[Tag]) -> None: """ Initialize Transformer instance. @@ -34,17 +35,19 @@ def __init__(self, source: str, input_records: Iterator[Tag]) -> None: self.skipped_record_count = 0 self.deleted_records: list[str] = [] + @final def __iter__(self) -> Iterator[TimdexRecord]: """Iterate over transformed records.""" return self + @final def __next__(self) -> TimdexRecord: """Return next transformed record.""" while True: - xml = next(self.input_records) + source_record = next(self.input_records) self.processed_record_count += 1 try: - record = self.transform(xml) + record = self.transform(source_record) except DeletedRecord as error: self.deleted_records.append(error.timdex_record_id) continue @@ -56,72 +59,76 @@ def __next__(self) -> TimdexRecord: continue @abstractmethod - def get_optional_fields(self, xml: Tag) -> Optional[dict]: + def get_optional_fields(self, source_record: Tag) -> Optional[dict]: """ Retrieve optional TIMDEX fields from an XML record. Must be overridden by source subclasses. Args: - xml: A BeautifulSoup Tag representing a single XML record + source_record: A BeautifulSoup Tag representing a single XML record """ return {} @classmethod @abstractmethod - def get_main_titles(cls, xml: Tag) -> list[Tag]: + def get_main_titles(cls, source_record: Tag) -> list[Tag]: """ Retrieve main title(s) from an XML record. Must be overridden by source subclasses. Args: - xml: A BeautifulSoup Tag representing a single XML record + source_record: A BeautifulSoup Tag representing a single XML record """ return [] @classmethod - def get_source_record_id(cls, xml: Tag) -> str: + def get_source_record_id(cls, source_record: Tag) -> str: """ Get or generate a source record ID from an XML record. May be overridden by source subclasses if needed. Args: - xml: A BeautifulSoup Tag representing a single XML record + source_record: A BeautifulSoup Tag representing a single XML record """ - return str(xml.header.find("identifier").string) + return str(source_record.header.find("identifier").string) @classmethod - def record_is_deleted(cls, xml: Tag) -> bool: + def record_is_deleted(cls, source_record: Tag) -> bool: """ Determine whether record has a status of deleted. May be overridden by source subclasses if needed. Args: - xml: A BeautifulSoup Tag representing a single XML record + source_record: A BeautifulSoup Tag representing a single XML record """ - if xml.find("header", status="deleted"): + if source_record.find("header", status="deleted"): return True return False @final - def get_required_fields(self, xml: Tag) -> dict: + def get_required_fields(self, source_record: Tag) -> dict: """ Get required TIMDEX fields from an XML record. May not be overridden. Args: - xml: A BeautifulSoup Tag representing a single OAI-PMH XML record. + source_record: A BeautifulSoup Tag representing a single OAI-PMH XML record. """ - source_record_id = self.get_source_record_id(xml) + source_record_id = self.get_source_record_id(source_record) # run methods to generate required fields - source_link = self.get_source_link(self.source_base_url, source_record_id, xml) - timdex_record_id = self.get_timdex_record_id(self.source, source_record_id, xml) - title = self.get_valid_title(source_record_id, xml) + source_link = self.get_source_link( + self.source_base_url, source_record_id, source_record + ) + timdex_record_id = self.get_timdex_record_id( + self.source, source_record_id, source_record + ) + title = self.get_valid_title(source_record_id, source_record) return { "source": self.source_name, @@ -131,25 +138,25 @@ def get_required_fields(self, xml: Tag) -> dict: } @final - def transform(self, xml: Tag) -> Optional[TimdexRecord]: + def transform(self, source_record: Tag) -> Optional[TimdexRecord]: """ Transform an OAI-PMH XML record into a TIMDEX record. May not be overridden. Args: - xml: A BeautifulSoup Tag representing a single OAI-PMH XML record. + source_record: A BeautifulSoup Tag representing a single OAI-PMH XML record. """ - if self.record_is_deleted(xml): - source_record_id = self.get_source_record_id(xml) + if self.record_is_deleted(source_record): + source_record_id = self.get_source_record_id(source_record) timdex_record_id = f"{self.source}:{source_record_id.replace('/', '-')}" raise DeletedRecord(timdex_record_id) - optional_fields = self.get_optional_fields(xml) + optional_fields = self.get_optional_fields(source_record) if optional_fields is None: return None else: fields = { - **self.get_required_fields(xml), + **self.get_required_fields(source_record), **optional_fields, } @@ -163,7 +170,7 @@ def transform(self, xml: Tag) -> Optional[TimdexRecord]: @final @classmethod - def get_valid_title(cls, source_record_id: str, xml: Tag) -> str: + def get_valid_title(cls, source_record_id: str, source_record: Tag) -> str: """ Retrieves main title(s) from an XML record and returns a valid title string. @@ -175,9 +182,9 @@ def get_valid_title(cls, source_record_id: str, xml: Tag) -> str: Args: source_record_id: Record identifier for the source record. - xml: A BeautifulSoup Tag representing a single XML record. + source_record: A BeautifulSoup Tag representing a single XML record. """ - all_titles = cls.get_main_titles(xml) + all_titles = cls.get_main_titles(source_record) if len(all_titles) > 1: logger.warning( "Record %s has multiple titles. Using the first title from the " @@ -199,7 +206,7 @@ def get_valid_title(cls, source_record_id: str, xml: Tag) -> str: @classmethod def get_source_link( - cls, source_base_url: str, source_record_id: str, xml: Tag + cls, source_base_url: str, source_record_id: str, source_record: Tag ) -> str: """ Class method to set the source link for the item. @@ -211,14 +218,16 @@ def get_source_link( Args: source_base_url: Source base URL. source_record_id: Record identifier for the source record. - xml: A BeautifulSoup Tag representing a single XML record. + source_record: A BeautifulSoup Tag representing a single XML record. - not used by default implementation, but could be useful for subclass overrides """ return source_base_url + source_record_id @classmethod - def get_timdex_record_id(cls, source: str, source_record_id: str, xml: Tag) -> str: + def get_timdex_record_id( + cls, source: str, source_record_id: str, source_record: Tag + ) -> str: """ Class method to set the TIMDEX record id. @@ -229,7 +238,7 @@ def get_timdex_record_id(cls, source: str, source_record_id: str, xml: Tag) -> s Args: source: Source name. source_record_id: Record identifier for the source record. - xml: A BeautifulSoup Tag representing a single XML record. + source_record: A BeautifulSoup Tag representing a single XML record. - not used by default implementation, but could be useful for subclass overrides """