From 9a736b952ae692ca9dce2ddd79b87de42cf50187 Mon Sep 17 00:00:00 2001 From: jonavellecuerdo Date: Thu, 30 May 2024 15:38:08 -0400 Subject: [PATCH] Field method refactor for Ead transform * Add field methods and corresponding unit tests: contents, contributors, dates, and identifiers * Update generate_name_identifier_url * Clarify that it is a private method associated with 'get_contributors' * Update syntax for 'create_string_*' and 'create_list_*' methods to use keyword args --- tests/sources/xml/test_ead.py | 287 +++++++++++++++++++++++++++++- transmogrifier/sources/xml/ead.py | 262 ++++++++++++++++----------- 2 files changed, 433 insertions(+), 116 deletions(-) diff --git a/tests/sources/xml/test_ead.py b/tests/sources/xml/test_ead.py index 8ceb1b0..ee4599a 100644 --- a/tests/sources/xml/test_ead.py +++ b/tests/sources/xml/test_ead.py @@ -586,15 +586,6 @@ def test_ead_transform_with_invalid_date_and_date_range_omits_dates(caplog): assert ("has a later start date than end date: '2001', '1999'") in caplog.text -def test_ead_transform_with_multiple_unitid_gets_valid_ids(): - ead_xml_records = Ead.parse_source_file( - "tests/fixtures/ead/ead_record_attribute_and_subfield_variations.xml" - ) - output_record = next(Ead("aspace", ead_xml_records)) - for identifier in output_record.identifiers: - assert identifier.value != "unitid-that-should-not-be-identifier" - - def test_get_alternate_titles_success(): source_record = create_ead_source_record_stub( metadata_insert=( @@ -739,6 +730,284 @@ def test_get_content_type_transforms_correctly_if_fields_missing(): assert Ead.get_content_type(source_record) == ["Archival materials"] +def test_get_contents_success(): + source_record = create_ead_source_record_stub( + metadata_insert=( + """ + + Arrangement +

This collection is organized into ten series:

+

Series 1. Charles J. Connick and Connick Studio documents

+

Series 2. Charles J. Connick Studio and Associates job information

+

Series 3. Charles J. Connick Stained Glass Foundation documents

+
+ """ + ), + parent_element="archdesc", + ) + assert Ead.get_contents(source_record) == [ + "This collection is organized into ten series:", + "Series 1. Charles J. Connick and Connick Studio documents", + "Series 2. Charles J. Connick Studio and Associates job information", + "Series 3. Charles J. Connick Stained Glass Foundation documents", + ] + + +def test_get_contents_transforms_correctly_if_fields_blank(): + source_record = create_ead_source_record_stub( + metadata_insert=( + """ + + """ + ), + parent_element="archdesc", + ) + assert Ead.get_contents(source_record) is None + + +def test_get_contents_transforms_correctly_if_fields_missing(): + source_record = create_ead_source_record_stub(parent_element="archdesc") + assert Ead.get_contents(source_record) is None + + +def test_get_contributors_success(): + source_record = create_ead_source_record_stub( + metadata_insert=( + """ + + + Author, Best E. + ( Best Ever ) + + + """ + ), + parent_element="did", + ) + assert Ead.get_contributors(source_record) == [ + timdex.Contributor(value="Author, Best E. ( Best Ever )", kind="Creator") + ] + + +def test_get_contributors_transforms_correctly_if_fields_blank(): + source_record = create_ead_source_record_stub( + metadata_insert=( + """ + + """ + ), + parent_element="did", + ) + assert Ead.get_contributors(source_record) is None + + +def test_get_contributors_transforms_correctly_if_fields_missing(): + source_record = create_ead_source_record_stub(parent_element="did") + assert Ead.get_contributors(source_record) is None + + +def test_get_contributors_transforms_correctly_if_multiple_contributors(): + source_record = create_ead_source_record_stub( + metadata_insert=( + """ + + + Author, Best E. + ( Best Ever ) + + + Author, Better + + """ + ), + parent_element="did", + ) + assert Ead.get_contributors(source_record) == [ + timdex.Contributor( + value="Author, Best E. ( Best Ever )", + kind="Creator", + ), + timdex.Contributor( + value="Author, Better", + kind="Creator", + ), + ] + + +def test_get_contributors_transforms_correctly_with_source_based_identifiers(): + source_record = create_ead_source_record_stub( + metadata_insert=( + """ + + + Author, Best E. + ( Best Ever ) + + + Author, Better + + + + Fambam + + """ + ), + parent_element="did", + ) + assert Ead.get_contributors(source_record) == [ + timdex.Contributor( + value="Author, Best E. ( Best Ever )", + identifier=["https://lccn.loc.gov/a001"], + kind="Creator", + ), + timdex.Contributor( + value="Author, Better", + identifier=["http://viaf.org/viaf/b001"], + kind="Creator", + ), + timdex.Contributor( + value="Fambam", identifier=["https://snaccooperative.org/view/c001"] + ), + ] + + +def test_get_dates_success(): + source_record = create_ead_source_record_stub( + header_insert=( + """ + oai:mit//repositories/2/resources/1 + """ + ), + metadata_insert=( + """ + + 1905-2012 + + 2023-01-01 + """ + ), + parent_element="did", + ) + assert Ead.get_dates(source_record) == [ + timdex.Date( + kind="creation", + note="approximate", + range=timdex.DateRange(gte="1905", lte="2012"), + ), + timdex.Date(value="2023-01-01"), + ] + + +def test_get_dates_transforms_correctly_if_fields_blank(): + source_record = create_ead_source_record_stub( + header_insert=( + """ + oai:mit//repositories/2/resources/1 + """ + ), + metadata_insert=( + """ + + """ + ), + parent_element="did", + ) + assert Ead.get_dates(source_record) is None + + +def test_get_dates_transforms_correctly_if_fields_missing(): + source_record = create_ead_source_record_stub( + header_insert=( + """ + oai:mit//repositories/2/resources/1 + """ + ), + parent_element="did", + ) + assert Ead.get_dates(source_record) is None + + +def test_get_dates_transforms_correctly_if_date_invalid(): + source_record = create_ead_source_record_stub( + header_insert=( + """ + oai:mit//repositories/2/resources/1 + """ + ), + metadata_insert=( + """ + + INVALID + + """ + ), + parent_element="did", + ) + assert Ead.get_dates(source_record) is None + + +def test_get_dates_transforms_correctly_if_normal_attribute_missing(): + source_record = create_ead_source_record_stub( + header_insert=( + """ + oai:mit//repositories/2/resources/1 + """ + ), + metadata_insert=( + """ + 2024 + """ + ), + parent_element="did", + ) + assert Ead.get_dates(source_record) is None + + +def test_get_identifiers_success(): + source_record = create_ead_source_record_stub( + metadata_insert=( + """ + a001 + """ + ), + parent_element="did", + ) + assert Ead.get_identifiers(source_record) == [ + timdex.Identifier(value="a001", kind="Collection Identifier") + ] + + +def test_get_identifiers_transforms_correctly_if_fields_blank(): + source_record = create_ead_source_record_stub( + metadata_insert=( + """ + + """ + ), + parent_element="did", + ) + assert Ead.get_identifiers(source_record) is None + + +def test_get_identifiers_transforms_correctly_if_fields_missing(): + source_record = create_ead_source_record_stub( + parent_element="did", + ) + assert Ead.get_identifiers(source_record) is None + + +def test_get_identifiers_transforms_correctly_if_type_attribute_invalid(): + source_record = create_ead_source_record_stub( + metadata_insert=( + """ + ignore-me + """ + ), + parent_element="did", + ) + assert Ead.get_identifiers(source_record) is None + + def test_get_main_titles_success(): source_record = create_ead_source_record_stub( metadata_insert=( diff --git a/transmogrifier/sources/xml/ead.py b/transmogrifier/sources/xml/ead.py index 970f455..6fb3366 100644 --- a/transmogrifier/sources/xml/ead.py +++ b/transmogrifier/sources/xml/ead.py @@ -29,8 +29,6 @@ def get_optional_fields(self, source_record: Tag) -> dict | None: """ fields: dict = {} - source_record_id = self.get_source_record_id(source_record) - # and elements are required when deriving optional fields collection_description = self._get_collection_description(source_record) collection_description_did = self._get_collection_description_did(source_record) @@ -48,33 +46,15 @@ def get_optional_fields(self, source_record: Tag) -> dict | None: # content_type fields["content_type"] = self.get_content_type(source_record) + # contents - for arrangement_element in collection_description.find_all( - "arrangement", recursive=False - ): - for arrangement_value in self.create_list_from_mixed_value( - arrangement_element, skipped_elements=["head"] - ): - fields.setdefault("contents", []).append(arrangement_value) + fields["contents"] = self.get_contents(source_record) # contributors - for origination_element in collection_description_did.find_all("origination"): - for name_element in origination_element.find_all(name=True, recursive=False): - if name_value := self.create_string_from_mixed_value( - name_element, separator=" " - ): - fields.setdefault("contributors", []).append( - timdex.Contributor( - value=name_value, - kind=origination_element.get("label") or None, - identifier=self.generate_name_identifier_url(name_element), - ) - ) + fields["contributors"] = self.get_contributors(source_record) # dates - dates = self.parse_dates(collection_description_did, source_record_id) - if dates: - fields.setdefault("dates", []).extend(dates) + fields["dates"] = self.get_dates(source_record) # edition field not used in EAD @@ -88,16 +68,7 @@ def get_optional_fields(self, source_record: Tag) -> dict | None: # holdings omitted pending discussion on how to map originalsloc and physloc # identifiers - for id_element in collection_description_did.find_all("unitid", recursive=False): - if id_element.get("type") == "aspace_uri": - continue - if id_value := self.create_string_from_mixed_value( - id_element, - separator=" ", - ): - fields.setdefault("identifiers", []).append( - timdex.Identifier(value=id_value, kind="Collection Identifier") - ) + fields["identifiers"] = self.get_identifiers(source_record) # languages for langmaterial_element in collection_description_did.find_all( @@ -306,28 +277,6 @@ def create_string_from_mixed_value( cls.create_list_from_mixed_value(xml_element, skipped_elements) ) - @classmethod - def generate_name_identifier_url(cls, name_element: Tag) -> list | None: - """ - Generate a full name identifier URL with the specified scheme. - - Args: - name_element: A BeautifulSoup Tag representing an EAD - name XML field. - """ - if identifier := name_element.get("authfilenumber"): - source = name_element.get("source") - if source in ["lcnaf", "naf"]: - base_url = "https://lccn.loc.gov/" - elif source == "snac": - base_url = "https://snaccooperative.org/view/" - elif source == "viaf": - base_url = "http://viaf.org/viaf/" - else: - base_url = "" - return [base_url + identifier] - return None - @classmethod def _get_collection_description(cls, source_record: Tag) -> Tag: """Get element with archival description for a collection. @@ -425,6 +374,156 @@ def get_content_type(cls, source_record: Tag) -> list[str] | None: return content_types or None + @classmethod + def get_contents(cls, source_record: Tag) -> list[str] | None: + contents = [] + collection_description = cls._get_collection_description(source_record) + for arrangement_element in collection_description.find_all( + "arrangement", recursive=False + ): + contents.extend( + cls.create_list_from_mixed_value( + arrangement_element, skipped_elements=["head"] + ) + ) + return contents or None + + @classmethod + def get_contributors(cls, source_record: Tag) -> list[timdex.Contributor] | None: + contributors = [] + collection_description_did = cls._get_collection_description_did(source_record) + for origination_element in collection_description_did.find_all("origination"): + contributors.extend( + [ + timdex.Contributor( + value=contributor_name, + kind=origination_element.get("label") or None, + identifier=cls._get_contributor_identifier_url( + contributor_element + ), + ) + for contributor_element in origination_element.find_all( + name=True, recursive=False + ) + if ( + contributor_name := cls.create_string_from_mixed_value( + contributor_element, separator=" " + ) + ) + ] + ) + + return contributors or None + + @classmethod + def _get_contributor_identifier_url(cls, name_element: Tag) -> list | None: + """Generate a full name identifier URL with the specified scheme. + + Args: + name_element: A BeautifulSoup Tag representing an EAD + name XML field. + """ + if identifier := name_element.get("authfilenumber"): + source = name_element.get("source") + if source in ["lcnaf", "naf"]: + base_url = "https://lccn.loc.gov/" + elif source == "snac": + base_url = "https://snaccooperative.org/view/" + elif source == "viaf": + base_url = "http://viaf.org/viaf/" + else: + base_url = "" + return [base_url + identifier] + return None + + @classmethod + def get_dates(cls, source_record: Tag) -> list[timdex.Date] | None: + dates = [] + collection_description_did = cls._get_collection_description_did(source_record) + source_record_id = cls.get_source_record_id(source_record) + dates.extend( + cls._parse_date_elements(collection_description_did, source_record_id) + ) + return dates or None + + @classmethod + def _parse_date_elements( + cls, collection_description_did: Tag, source_record_id: str + ) -> list[timdex.Date]: + """ + Dates are derived from elements with the @normal attribute. + + The method will iterate over these elements, validating the values assigned to + the @normal attribute. If the date value passes validation, a timdex.Date + instance is created and added to the list of valid timdex.Dates that is + returned. + + Note: If the date values in a provided date range are the same, a + single date is retrieved. + """ + dates = [] + for date_element in collection_description_did.find_all("unitdate", normal=True): + date_string = date_element.get("normal", "").strip() + + if date_string == "": + continue + + date_instance = timdex.Date() + + # get valid date ranges or dates + if "/" in date_string: + date_instance = cls._parse_date_range(date_string, source_record_id) + elif validate_date(date_string, source_record_id): + date_instance.value = date_string + + # include @datechar and @certainty attributes + date_instance.kind = date_element.get("datechar") + date_instance.note = date_element.get("certainty") + + if date_instance.range or date_instance.value: + dates.append(date_instance) + return dates + + @classmethod + def _parse_date_range(cls, date_string: str, source_record_id: str) -> timdex.Date: + date_instance = timdex.Date() + gte_date, lte_date = date_string.split("/") + if gte_date != lte_date: + if validate_date_range( + gte_date, + lte_date, + source_record_id, + ): + date_instance.range = timdex.DateRange( + gte=gte_date, + lte=lte_date, + ) + else: + # get valid date (if dates in ranges are the same) + date_string = gte_date + if validate_date( + date_string, + source_record_id, + ): + date_instance.value = date_string + return date_instance + + @classmethod + def get_identifiers(cls, source_record: Tag) -> list[timdex.Identifier] | None: + identifiers = [] + collection_description_did = cls._get_collection_description_did(source_record) + for id_element in collection_description_did.find_all("unitid", recursive=False): + if id_element.get("type") == "aspace_uri": + continue + if id_value := cls.create_string_from_mixed_value( + id_element, + separator=" ", + ): + identifiers.append( + timdex.Identifier(value=id_value, kind="Collection Identifier") + ) + return identifiers or None + @classmethod def get_main_titles(cls, source_record: Tag) -> list[str]: """ @@ -486,54 +585,3 @@ def parse_mixed_value( elif isinstance(item, Tag) and item.name not in skipped_elements: for child in item.children: yield from cls.parse_mixed_value(child, skipped_elements) - - def parse_dates( - self, collection_description_did: Tag, source_record_id: str - ) -> list[timdex.Date]: - """ - Dedicated method to parse dates. Targeting archdesc.unitdata elements, using - only those with a @normal attribute value. These are almost uniformly ranges, - but in the event they are not (or two identical values for the range) a single - date value is produced. - """ - dates = [] - for date_element in collection_description_did.find_all("unitdate"): - normal_date = date_element.get("normal", "").strip() - if normal_date == "": - continue - - date_instance = timdex.Date() - - # date range - if "/" in normal_date: - gte_date, lte_date = normal_date.split("/") - if gte_date != lte_date: - if validate_date_range( - gte_date, - lte_date, - source_record_id, - ): - date_instance.range = timdex.DateRange( - gte=gte_date, - lte=lte_date, - ) - else: - date_str = gte_date # arbitrarily take one - if validate_date( - date_str, - source_record_id, - ): - date_instance.value = date_str - - # fallback on single date - elif validate_date(normal_date, source_record_id): - date_instance.value = normal_date - - # include @datechar and @certainty attributes - date_instance.kind = date_element.get("datechar") - date_instance.note = date_element.get("certainty") - - if date_instance.range or date_instance.value: - dates.append(date_instance) - - return dates