Skip to content

Commit

Permalink
Create Transformer base class
Browse files Browse the repository at this point in the history
Why these changes are being introduced:
* A format-agnostic Transformer base class is needed for deriving both XmlTransformer and JsonTransformer format classes

How this addresses that need:
* Create a Transformer base class
* Add JSON type for validation
* Refactor XmlTransformer to derive from Transformer class
* Rename the `xml` argument to `source_record` and update docstrings

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-82
  • Loading branch information
ehanson8 committed Nov 20, 2023
1 parent 43dba21 commit 1989ca3
Show file tree
Hide file tree
Showing 2 changed files with 185 additions and 13 deletions.
10 changes: 9 additions & 1 deletion tests/test_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,15 @@
from transmogrifier.helpers import parse_xml_records
from transmogrifier.models import TimdexRecord
from transmogrifier.sources.datacite import Datacite
from transmogrifier.sources.transformer import XmlTransformer
from transmogrifier.sources.transformer import Transformer, XmlTransformer


def test_transformer_initializes_with_expected_attributes(oai_pmh_records):
    """Check the attributes set on a Transformer built for the "cool-repo" source."""
    base_transformer = Transformer("cool-repo", oai_pmh_records)

    # The source label and the input records are stored as passed in.
    assert base_transformer.source == "cool-repo"
    # The base URL and display name are looked up from the source configuration.
    assert base_transformer.source_base_url == "https://example.com/"
    assert base_transformer.source_name == "A Cool Repository"
    assert base_transformer.input_records == oai_pmh_records


def test_xmltransformer_initializes_with_expected_attributes(oai_pmh_records):
Expand Down
188 changes: 176 additions & 12 deletions transmogrifier/sources/transformer.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,45 @@
"""Transformer module."""
import logging
from abc import ABCMeta, abstractmethod
from typing import Iterator, Optional, final
from typing import Iterator, Optional, TypeAlias, final

from bs4 import Tag
from bs4 import BeautifulSoup, Tag

# Note: the lxml module in defusedxml is deprecated, so we have to use the
# regular lxml library. Transmogrifier only parses data from known sources so this
# should not be a security issue.
from lxml import etree # nosec B410

from transmogrifier.config import SOURCES
from transmogrifier.helpers import DeletedRecord, generate_citation
from transmogrifier.models import TimdexRecord

logger = logging.getLogger(__name__)

JSON: TypeAlias = dict[str, "JSON"] | list["JSON"] | str | int | float | bool | None


class XmlTransformer(object):
class Transformer(object):
"""Base transformer class."""

__metaclass__ = ABCMeta

@final
def __init__(self, source: str, input_records: Iterator[Tag]) -> None:
def __init__(self, source: str, input_records: Iterator[JSON | Tag]) -> None:
"""
Initialize Transformer instance.
Args:
source: Source repository short label. Must match a source key from
config.SOURCES
source: Source repository label. Must match a source key from config.SOURCES.
input_records: A set of source records to be processed.
"""
self.source = source
self.source_base_url = SOURCES[source]["base-url"]
self.source: str = source
self.source_base_url: str = SOURCES[source]["base-url"]
self.source_name = SOURCES[source]["name"]
self.input_records = input_records
self.processed_record_count = 0
self.transformed_record_count = 0
self.skipped_record_count = 0
self.input_records: Iterator[JSON | Tag] = input_records
self.processed_record_count: int = 0
self.transformed_record_count: int = 0
self.skipped_record_count: int = 0
self.deleted_records: list[str] = []

@final
Expand All @@ -58,6 +65,163 @@ def __next__(self) -> TimdexRecord:
self.skipped_record_count += 1
continue

@classmethod
@abstractmethod
def parse_source_file(cls, input_file: str) -> Iterator[JSON | Tag]:
    """
    Parse a source file and provide its source records through an iterator.

    Must be overridden by format subclasses. The base implementation has no
    statements beyond this docstring, so calling it returns None.

    Args:
        input_file: Source file to parse.
    """

@abstractmethod
def get_optional_fields(self, source_record: JSON | Tag) -> Optional[dict]:
    """
    Collect the optional TIMDEX fields for one source record.

    Must be overridden by source subclasses. The base implementation returns
    an empty dict.

    Args:
        source_record: A single source record.
    """
    return {}

@classmethod
@abstractmethod
def get_main_titles(cls, source_record: JSON | Tag) -> list[Tag | str]:
    """
    Retrieve the main title(s) from a source record.

    Must be overridden by source subclasses. The base implementation returns
    an empty list.

    Args:
        source_record: A single source record.
    """
    return []

@classmethod
@abstractmethod
def get_source_record_id(cls, source_record: JSON | Tag) -> str:
    """
    Get or generate the source record ID for a source record.

    Must be overridden by source subclasses. The base implementation returns
    an empty string.

    Args:
        source_record: A single source record.
    """
    return ""

@classmethod
@abstractmethod
def record_is_deleted(cls, source_record: JSON | Tag) -> bool:
    """
    Report whether a source record has a status of deleted.

    Must be overridden by source subclasses. The base implementation always
    returns False.

    Args:
        source_record: A single source record.
    """
    return False

@abstractmethod
def get_required_fields(self, source_record: JSON | Tag) -> dict:
    """
    Collect the required TIMDEX fields for one source record.

    Must be overridden by format subclasses. The base implementation returns
    an empty dict.

    Args:
        source_record: A single source record.
    """
    return {}

@abstractmethod
def transform(self, source_record: JSON | Tag) -> Optional[TimdexRecord]:
    """
    Convert one source record into a TIMDEX record.

    Must be overridden by format subclasses. The base implementation returns
    None.

    Args:
        source_record: A single source record.
    """
    return None

@classmethod
@abstractmethod
def get_valid_title(cls, source_record_id: str, source_record: JSON | Tag) -> str:
    """
    Retrieve main title(s) from a source record and return a valid title string.

    Must be overridden by source subclasses. The base implementation returns
    an empty string.

    Args:
        source_record_id: Record identifier for the source record.
        source_record: A single source record.
    """
    return ""

@classmethod
@abstractmethod
def get_source_link(
    cls,
    source_base_url: str,
    source_record_id: str,
    source_record: JSON | Tag,
) -> str:
    """
    Set the source link for the item.

    Must be overridden by source subclasses. The base implementation returns
    an empty string.

    Args:
        source_base_url: Source base URL.
        source_record_id: Record identifier for the source record.
        source_record: A single source record.
    """
    return ""

@classmethod
@abstractmethod
def get_timdex_record_id(
    cls, source: str, source_record_id: str, source_record: JSON | Tag
) -> str:
    """
    Set the TIMDEX record id.

    Must be overridden by source subclasses. The base implementation returns
    an empty string.

    Args:
        source: Source name.
        source_record_id: Record identifier for the source record.
        source_record: A single source record. Annotated as JSON | Tag, like
            the other record-handling methods declared alongside this one.
    """
    return ""


class XmlTransformer(Transformer):
"""Base transformer class."""

@final
@classmethod
def parse_source_file(cls, input_file: str) -> Iterator[Tag]:
    """
    Parse an XML source file and yield its records one at a time.

    May not be overridden.

    Each element matching the "{*}record" tag is serialized to UTF-8 bytes,
    re-parsed with BeautifulSoup's "xml" parser, and yielded.

    Args:
        input_file: Path of the file to open (in binary mode) and parse.
    """
    with open(input_file, "rb") as xml_file:
        record_elements = etree.iterparse(
            xml_file,
            tag="{*}record",
            encoding="utf-8",
            recover=True,
        )
        for _, record_element in record_elements:
            yield BeautifulSoup(
                etree.tostring(record_element, encoding="utf-8"), "xml"
            )
            # Runs once the consumer asks for the next record: clear the
            # element that has just been handed out.
            record_element.clear()

@abstractmethod
def get_optional_fields(self, source_record: Tag) -> Optional[dict]:
"""
Expand Down

0 comments on commit 1989ca3

Please sign in to comment.