Skip to content

Commit

Permalink
Gdt 83 jsontransformer (#107)
Browse files Browse the repository at this point in the history
* Transformer and XmlTransformer refactor

* Retype Transformer class as ABC
* Reorder Transformer class methods for more logical flow
* Replace abstractmethod return statements with pass
* Move the get_valid_title method to the Transformer base class as a final method
* Remove outdated Transformer class test
* Update get_main_titles method in datacite, dspace_dim, dspace_mets, and oaidc transforms to return str values

* Create JsonTransformer class

Why these changes are being introduced:
* A generic JSON format class is needed as a base class for JSON metadata formats such as aardvark

How this addresses that need:
* Create JsonTransformer class
* Rename write_timdex_records_to_json > write_timdex_records_to_json_file
* Move write_deleted_records_to_file from the helpers module to a Transformer method and rename it to write_deleted_records_to_txt_file
* Move the CLI code block into the Transformer.write_output_files method
* Add corresponding unit tests for write_output_files method

Side effects of this change:
* None

Relevant ticket(s):
* https://mitlibraries.atlassian.net/browse/GDT-83

* Updates based on discussion in PR #107

* Rename _write_output_files > transform_and_write_output_files
* Rename write_timdex_records_to_json_file > _write_timdex_records_to_json_file
* Rename write_deleted_records_to_txt_file > _write_deleted_records_to_txt_file
* Add docstrings
* Add abstractmethod decorator to get_main_titles
  • Loading branch information
ehanson8 authored Dec 6, 2023
1 parent ee4da86 commit 62d8e79
Show file tree
Hide file tree
Showing 8 changed files with 366 additions and 176 deletions.
38 changes: 27 additions & 11 deletions tests/test_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,6 @@
from transmogrifier.sources.transformer import Transformer, XmlTransformer


def test_transformer_initializes_with_expected_attributes(oai_pmh_records):
    """Check the attributes set on a newly constructed Transformer.

    Builds a Transformer for the "cool-repo" source from the ``oai_pmh_records``
    fixture and asserts the ``source``, ``source_base_url``, ``source_name``, and
    ``source_records`` attributes.
    """
    source = "cool-repo"
    transformer = Transformer(source, oai_pmh_records)

    assert transformer.source == "cool-repo"
    assert transformer.source_base_url == "https://example.com/"
    assert transformer.source_name == "A Cool Repository"
    assert transformer.source_records == oai_pmh_records


def test_transformer_get_transformer_returns_correct_class_name():
    """Check that the "jpal" source resolves to the Datacite transformer class."""
    transformer_class = Transformer.get_transformer("jpal")
    assert transformer_class == Datacite

Expand Down Expand Up @@ -65,6 +57,31 @@ def test_xmltransformer_iterates_successfully_if_get_optional_fields_returns_non
assert len(output_records.deleted_records) == 1


def test_xmltransformer_transform_and_write_output_files_writes_output_files(
    tmp_path, oai_pmh_records
):
    """Check that transforming the OAI-PMH fixture writes a JSON and a TXT file.

    Runs ``transform_and_write_output_files`` against the ``oai_pmh_records``
    fixture and asserts that exactly two files, ``output_file.json`` and
    ``output_file.txt``, exist in the temporary directory afterwards.

    NOTE(review): the TXT file is presumably the deleted-records output; confirm
    against ``Transformer.transform_and_write_output_files``.
    """
    output_file = str(tmp_path / "output_file.json")
    transformer = XmlTransformer("cool-repo", oai_pmh_records)
    transformer.transform_and_write_output_files(output_file)

    # Path.iterdir() yields entries in arbitrary order, so sort the names before
    # comparing instead of relying on positional indexes into the listing.
    output_file_names = sorted(path.name for path in tmp_path.iterdir())
    assert output_file_names == ["output_file.json", "output_file.txt"]


def test_xmltransformer_transform_and_write_output_files_no_txt_file_if_not_needed(
    tmp_path,
):
    """Check that only the JSON file is written for the Datacite fixture.

    Parses ``tests/fixtures/datacite/datacite_records.xml``, runs
    ``transform_and_write_output_files``, and asserts that the temporary
    directory contains a single entry named ``output_file.json``.

    NOTE(review): presumably this fixture contains no deleted records, which is
    why no TXT file is expected; confirm against the fixture.
    """
    source_records = XmlTransformer.parse_source_file(
        "tests/fixtures/datacite/datacite_records.xml"
    )
    json_path = tmp_path / "output_file.json"
    XmlTransformer("cool-repo", source_records).transform_and_write_output_files(
        str(json_path)
    )

    written_names = [entry.name for entry in tmp_path.iterdir()]
    assert len(written_names) == 1
    assert written_names[0] == "output_file.json"


def test_xmltransformer_parse_source_file_returns_record_iterator():
records = XmlTransformer.parse_source_file(
"tests/fixtures/datacite/datacite_records.xml"
Expand Down Expand Up @@ -136,7 +153,6 @@ def test_xmltransformer_get_valid_title_with_title_field_multiple_logs_warning(c
)
assert (
"Record doi:10.7910/DVN/19PPE7 has multiple titles. Using the first title from "
"the following titles found: [<title>The Impact of Maternal Literacy and "
"Participation Programs</title>, <title>Additional Title</title>]"
in caplog.text
"the following titles found: ['The Impact of Maternal Literacy and "
"Participation Programs', 'Additional Title']" in caplog.text
)
10 changes: 1 addition & 9 deletions transmogrifier/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import click

from transmogrifier.config import SOURCES, configure_logger, configure_sentry
from transmogrifier.helpers import write_deleted_records_to_file
from transmogrifier.sources.transformer import Transformer

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -42,14 +41,7 @@ def main(source, input_file, output_file, verbose):
logger.info("Running transform for source %s", source)

transformer = Transformer.load(source, input_file)
transformer.write_timdex_records_to_json(output_file)
if transformer.processed_record_count == 0:
raise ValueError("No records processed from input file, needs investigation")
if deleted_records := transformer.deleted_records:
deleted_output_file = output_file.replace("index", "delete").replace(
"json", "txt"
)
write_deleted_records_to_file(deleted_records, deleted_output_file)
transformer.transform_and_write_output_files(output_file)
logger.info(
(
"Completed transform, total records processed: %d, "
Expand Down
8 changes: 0 additions & 8 deletions transmogrifier/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
from datetime import datetime
from typing import Optional

from smart_open import open

from transmogrifier.config import DATE_FORMATS

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -140,12 +138,6 @@ def validate_date_range(
return False


def write_deleted_records_to_file(deleted_records: list[str], output_file_path: str):
    """Write each deleted record identifier to the output file, one per line.

    Args:
        deleted_records: Record identifiers to write; each is followed by a
            newline character.
        output_file_path: Path of the file, which is opened in write mode.
    """
    with open(output_file_path, "w") as deleted_file:
        deleted_file.writelines(f"{record_id}\n" for record_id in deleted_records)


class DeletedRecord(Exception):
"""Exception raised for records with a deleted status.
Expand Down
6 changes: 3 additions & 3 deletions transmogrifier/sources/datacite.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:
for index, title in enumerate(self.get_main_titles(xml)):
if index > 0:
fields.setdefault("alternate_titles", []).append(
timdex.AlternateTitle(value=title.string)
timdex.AlternateTitle(value=title)
)

# content_type
Expand Down Expand Up @@ -309,7 +309,7 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:
return fields

@classmethod
def get_main_titles(cls, xml: Tag) -> list[Tag]:
def get_main_titles(cls, xml: Tag) -> list[str]:
"""
Retrieve main title(s) from a Datacite XML record.
Expand All @@ -320,7 +320,7 @@ def get_main_titles(cls, xml: Tag) -> list[Tag]:
oai_datacite XML.
"""
return [
t
t.string
for t in xml.metadata.find_all("title", string=True)
if not t.get("titleType")
]
Expand Down
8 changes: 4 additions & 4 deletions transmogrifier/sources/dspace_dim.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]:
for index, title in enumerate(self.get_main_titles(xml)):
if index > 0:
fields.setdefault("alternate_titles", []).append(
timdex.AlternateTitle(value=title.string)
timdex.AlternateTitle(value=title)
)

# citation
Expand Down Expand Up @@ -280,7 +280,7 @@ def get_content_types(cls, xml: Tag) -> Optional[list[str]]:
] or None

@classmethod
def get_main_titles(cls, xml: Tag) -> list[Tag]:
def get_main_titles(cls, xml: Tag) -> list[str]:
"""
Retrieve main title(s) from a DSpace DIM XML record.
Expand All @@ -290,8 +290,8 @@ def get_main_titles(cls, xml: Tag) -> list[Tag]:
xml: A BeautifulSoup Tag representing a single DSpace DIM XML record.
"""
return [
t
for t in xml.find_all("dim:field", element="title")
t.string
for t in xml.find_all("dim:field", element="title", string=True)
if "qualifier" not in t.attrs
]

Expand Down
10 changes: 7 additions & 3 deletions transmogrifier/sources/dspace_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def get_optional_fields(self, xml: Tag) -> dict:
for index, title in enumerate(self.get_main_titles(xml)):
if index > 0:
fields.setdefault("alternate_titles", []).append(
timdex.AlternateTitle(value=title.string)
timdex.AlternateTitle(value=title)
)

# call_numbers: relevant field in DSpace (dc.subject.classification) is not
Expand Down Expand Up @@ -191,7 +191,7 @@ def get_optional_fields(self, xml: Tag) -> dict:
return fields

@classmethod
def get_main_titles(cls, xml: Tag) -> list[Tag]:
def get_main_titles(cls, xml: Tag) -> list[str]:
"""
Retrieve main title(s) from a DSpace METS XML record.
Expand All @@ -200,7 +200,11 @@ def get_main_titles(cls, xml: Tag) -> list[Tag]:
Args:
xml: A BeautifulSoup Tag representing a single DSpace METS XML record.
"""
return [t for t in xml.find_all("mods:title", string=True) if not t.get("type")]
return [
t.string
for t in xml.find_all("mods:title", string=True)
if not t.get("type")
]

@classmethod
def get_source_record_id(cls, xml: Tag) -> str:
Expand Down
4 changes: 2 additions & 2 deletions transmogrifier/sources/oaidc.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def get_links(self, source_record_id: str, xml: Tag) -> Optional[List[timdex.Lin
return None

@classmethod
def get_main_titles(cls, xml: Tag) -> list[Tag]:
def get_main_titles(cls, xml: Tag) -> list[str]:
"""
Retrieve main title(s) from a generic OAI DC XML record.
Expand All @@ -162,7 +162,7 @@ def get_main_titles(cls, xml: Tag) -> list[Tag]:
Args:
xml: A BeautifulSoup Tag representing a single OAI DC XML record.
"""
return [t for t in xml.find_all("dc:title")]
return [t.string for t in xml.find_all("dc:title", string=True)]

@classmethod
def get_source_record_id(cls, xml: Tag) -> str:
Expand Down
Loading

0 comments on commit 62d8e79

Please sign in to comment.