From 9a736b952ae692ca9dce2ddd79b87de42cf50187 Mon Sep 17 00:00:00 2001
From: jonavellecuerdo <cuerdojonavelle@gmail.com>
Date: Thu, 30 May 2024 15:38:08 -0400
Subject: [PATCH] Field method refactor for Ead transform

* Add field methods and corresponding unit tests:
  contents, contributors, dates, and identifiers
* Update generate_name_identifier_url
  * Clarify that it is a private method associated with 'get_contributors'
* Update syntax for 'create_string_*' and 'create_list_*' methods
  to use keyword args
---
 tests/sources/xml/test_ead.py     | 287 +++++++++++++++++++++++++++++-
 transmogrifier/sources/xml/ead.py | 262 ++++++++++++++++-----------
 2 files changed, 433 insertions(+), 116 deletions(-)
diff --git a/tests/sources/xml/test_ead.py b/tests/sources/xml/test_ead.py
index 8ceb1b0..ee4599a 100644
--- a/tests/sources/xml/test_ead.py
+++ b/tests/sources/xml/test_ead.py
@@ -586,15 +586,6 @@ def test_ead_transform_with_invalid_date_and_date_range_omits_dates(caplog):
     assert ("has a later start date than end date: '2001', '1999'") in caplog.text
 
 
-def test_ead_transform_with_multiple_unitid_gets_valid_ids():
-    ead_xml_records = Ead.parse_source_file(
-        "tests/fixtures/ead/ead_record_attribute_and_subfield_variations.xml"
-    )
-    output_record = next(Ead("aspace", ead_xml_records))
-    for identifier in output_record.identifiers:
-        assert identifier.value != "unitid-that-should-not-be-identifier"
-
-
 def test_get_alternate_titles_success():
     source_record = create_ead_source_record_stub(
         metadata_insert=(
@@ -739,6 +730,284 @@ def test_get_content_type_transforms_correctly_if_fields_missing():
     assert Ead.get_content_type(source_record) == ["Archival materials"]
 
 
+def test_get_contents_success():
+    source_record = create_ead_source_record_stub(
+        metadata_insert=(
+            """
+            <arrangement>
+                <head>Arrangement</head>
+                <p>This collection is organized into ten series: </p>
+                <p>Series 1. Charles J. Connick and Connick Studio documents</p>
+                <p>Series 2. Charles J. Connick Studio and Associates job information</p>
+                <p>Series 3. Charles J. Connick Stained Glass Foundation documents</p>
+            </arrangement>
+            """
+        ),
+        parent_element="archdesc",
+    )
+    assert Ead.get_contents(source_record) == [
+        "This collection is organized into ten series:",
+        "Series 1. Charles J. Connick and Connick Studio documents",
+        "Series 2. Charles J. Connick Studio and Associates job information",
+        "Series 3. Charles J. Connick Stained Glass Foundation documents",
+    ]
+
+
+def test_get_contents_transforms_correctly_if_fields_blank():
+    source_record = create_ead_source_record_stub(
+        metadata_insert=(
+            """
+            <arrangement></arrangement>
+            """
+        ),
+        parent_element="archdesc",
+    )
+    assert Ead.get_contents(source_record) is None
+
+
+def test_get_contents_transforms_correctly_if_fields_missing():
+    source_record = create_ead_source_record_stub(parent_element="archdesc")
+    assert Ead.get_contents(source_record) is None
+
+
+def test_get_contributors_success():
+    source_record = create_ead_source_record_stub(
+        metadata_insert=(
+            """
+            <origination label="Creator">
+                <persname>
+                    Author, Best E.
+                    <part>( <emph> Best <emph>Ever</emph> </emph> )</part>
+                </persname>
+            </origination>
+            """
+        ),
+        parent_element="did",
+    )
+    assert Ead.get_contributors(source_record) == [
+        timdex.Contributor(value="Author, Best E. ( Best Ever )", kind="Creator")
+    ]
+
+
+def test_get_contributors_transforms_correctly_if_fields_blank():
+    source_record = create_ead_source_record_stub(
+        metadata_insert=(
+            """
+            <origination></origination>
+            """
+        ),
+        parent_element="did",
+    )
+    assert Ead.get_contributors(source_record) is None
+
+
+def test_get_contributors_transforms_correctly_if_fields_missing():
+    source_record = create_ead_source_record_stub(parent_element="did")
+    assert Ead.get_contributors(source_record) is None
+
+
+def test_get_contributors_transforms_correctly_if_multiple_contributors():
+    source_record = create_ead_source_record_stub(
+        metadata_insert=(
+            """
+            <origination label="Creator">
+                <persname>
+                    Author, Best E.
+                    <part>( <emph> Best <emph>Ever</emph> </emph> )</part>
+                </persname>
+                <persname>
+                    Author, Better
+                </persname>
+            """
+        ),
+        parent_element="did",
+    )
+    assert Ead.get_contributors(source_record) == [
+        timdex.Contributor(
+            value="Author, Best E. ( Best Ever )",
+            kind="Creator",
+        ),
+        timdex.Contributor(
+            value="Author, Better",
+            kind="Creator",
+        ),
+    ]
+
+
+def test_get_contributors_transforms_correctly_with_source_based_identifiers():
+    source_record = create_ead_source_record_stub(
+        metadata_insert=(
+            """
+            <origination label="Creator">
+                <persname authfilenumber="a001" source="naf">
+                    Author, Best E.
+                    <part>( <emph> Best <emph>Ever</emph> </emph> )</part>
+                </persname>
+                <persname authfilenumber="b001" source="viaf">
+                    Author, Better
+                </persname>
+            </origination>
+            <origination>
+                <famname authfilenumber="c001" source="snac">Fambam</famname>
+            </famname>
+            """
+        ),
+        parent_element="did",
+    )
+    assert Ead.get_contributors(source_record) == [
+        timdex.Contributor(
+            value="Author, Best E. ( Best Ever )",
+            identifier=["https://lccn.loc.gov/a001"],
+            kind="Creator",
+        ),
+        timdex.Contributor(
+            value="Author, Better",
+            identifier=["http://viaf.org/viaf/b001"],
+            kind="Creator",
+        ),
+        timdex.Contributor(
+            value="Fambam", identifier=["https://snaccooperative.org/view/c001"]
+        ),
+    ]
+
+
+def test_get_dates_success():
+    source_record = create_ead_source_record_stub(
+        header_insert=(
+            """
+            <identifier>oai:mit//repositories/2/resources/1</identifier>
+            """
+        ),
+        metadata_insert=(
+            """
+            <unitdate certainty="approximate" datechar="creation" normal="1905/2012">
+                1905-2012
+            </unitdate>
+            <unitdate normal="2023-01-01">2023-01-01</unitdate>
+            """
+        ),
+        parent_element="did",
+    )
+    assert Ead.get_dates(source_record) == [
+        timdex.Date(
+            kind="creation",
+            note="approximate",
+            range=timdex.DateRange(gte="1905", lte="2012"),
+        ),
+        timdex.Date(value="2023-01-01"),
+    ]
+
+
+def test_get_dates_transforms_correctly_if_fields_blank():
+    source_record = create_ead_source_record_stub(
+        header_insert=(
+            """
+            <identifier>oai:mit//repositories/2/resources/1</identifier>
+            """
+        ),
+        metadata_insert=(
+            """
+            <unitdate certainty="approximate" datechar="creation" normal=""></unitdate>
+            """
+        ),
+        parent_element="did",
+    )
+    assert Ead.get_dates(source_record) is None
+
+
+def test_get_dates_transforms_correctly_if_fields_missing():
+    source_record = create_ead_source_record_stub(
+        header_insert=(
+            """
+            <identifier>oai:mit//repositories/2/resources/1</identifier>
+            """
+        ),
+        parent_element="did",
+    )
+    assert Ead.get_dates(source_record) is None
+
+
+def test_get_dates_transforms_correctly_if_date_invalid():
+    source_record = create_ead_source_record_stub(
+        header_insert=(
+            """
+            <identifier>oai:mit//repositories/2/resources/1</identifier>
+            """
+        ),
+        metadata_insert=(
+            """
+            <unitdate certainty="approximate" datechar="creation" normal="">
+                INVALID
+            </unitdate>
+            """
+        ),
+        parent_element="did",
+    )
+    assert Ead.get_dates(source_record) is None
+
+
+def test_get_dates_transforms_correctly_if_normal_attribute_missing():
+    source_record = create_ead_source_record_stub(
+        header_insert=(
+            """
+            <identifier>oai:mit//repositories/2/resources/1</identifier>
+            """
+        ),
+        metadata_insert=(
+            """
+            <unitdate certainty="approximate" datechar="creation">2024</unitdate>
+            """
+        ),
+        parent_element="did",
+    )
+    assert Ead.get_dates(source_record) is None
+
+
+def test_get_identifiers_success():
+    source_record = create_ead_source_record_stub(
+        metadata_insert=(
+            """
+            <unitid>a001</unitid>
+            """
+        ),
+        parent_element="did",
+    )
+    assert Ead.get_identifiers(source_record) == [
+        timdex.Identifier(value="a001", kind="Collection Identifier")
+    ]
+
+
+def test_get_identifiers_transforms_correctly_if_fields_blank():
+    source_record = create_ead_source_record_stub(
+        metadata_insert=(
+            """
+            <unitid></unitid>
+            """
+        ),
+        parent_element="did",
+    )
+    assert Ead.get_identifiers(source_record) is None
+
+
+def test_get_identifiers_transforms_correctly_if_fields_missing():
+    source_record = create_ead_source_record_stub(
+        parent_element="did",
+    )
+    assert Ead.get_identifiers(source_record) is None
+
+
+def test_get_identifiers_transforms_correctly_if_type_attribute_invalid():
+    source_record = create_ead_source_record_stub(
+        metadata_insert=(
+            """
+            <unitid type="aspace_uri">ignore-me</unitid>
+            """
+        ),
+        parent_element="did",
+    )
+    assert Ead.get_identifiers(source_record) is None
+
+
 def test_get_main_titles_success():
     source_record = create_ead_source_record_stub(
         metadata_insert=(
diff --git a/transmogrifier/sources/xml/ead.py b/transmogrifier/sources/xml/ead.py
index 970f455..6fb3366 100644
--- a/transmogrifier/sources/xml/ead.py
+++ b/transmogrifier/sources/xml/ead.py
@@ -29,8 +29,6 @@ def get_optional_fields(self, source_record: Tag) -> dict | None:
         """
         fields: dict = {}
 
-        source_record_id = self.get_source_record_id(source_record)
-
         # <archdesc> and <did> elements are required when deriving optional fields
         collection_description = self._get_collection_description(source_record)
         collection_description_did = self._get_collection_description_did(source_record)
@@ -48,33 +46,15 @@ def get_optional_fields(self, source_record: Tag) -> dict | None:
 
         # content_type
         fields["content_type"] = self.get_content_type(source_record)
+
         # contents
-        for arrangement_element in collection_description.find_all(
-            "arrangement", recursive=False
-        ):
-            for arrangement_value in self.create_list_from_mixed_value(
-                arrangement_element, skipped_elements=["head"]
-            ):
-                fields.setdefault("contents", []).append(arrangement_value)
+        fields["contents"] = self.get_contents(source_record)
 
         # contributors
-        for origination_element in collection_description_did.find_all("origination"):
-            for name_element in origination_element.find_all(name=True, recursive=False):
-                if name_value := self.create_string_from_mixed_value(
-                    name_element, separator=" "
-                ):
-                    fields.setdefault("contributors", []).append(
-                        timdex.Contributor(
-                            value=name_value,
-                            kind=origination_element.get("label") or None,
-                            identifier=self.generate_name_identifier_url(name_element),
-                        )
-                    )
+        fields["contributors"] = self.get_contributors(source_record)
 
         # dates
-        dates = self.parse_dates(collection_description_did, source_record_id)
-        if dates:
-            fields.setdefault("dates", []).extend(dates)
+        fields["dates"] = self.get_dates(source_record)
 
         # edition field not used in EAD
 
@@ -88,16 +68,7 @@ def get_optional_fields(self, source_record: Tag) -> dict | None:
         # holdings omitted pending discussion on how to map originalsloc and physloc
 
         # identifiers
-        for id_element in collection_description_did.find_all("unitid", recursive=False):
-            if id_element.get("type") == "aspace_uri":
-                continue
-            if id_value := self.create_string_from_mixed_value(
-                id_element,
-                separator=" ",
-            ):
-                fields.setdefault("identifiers", []).append(
-                    timdex.Identifier(value=id_value, kind="Collection Identifier")
-                )
+        fields["identifiers"] = self.get_identifiers(source_record)
 
         # languages
         for langmaterial_element in collection_description_did.find_all(
@@ -306,28 +277,6 @@ def create_string_from_mixed_value(
             cls.create_list_from_mixed_value(xml_element, skipped_elements)
         )
 
-    @classmethod
-    def generate_name_identifier_url(cls, name_element: Tag) -> list | None:
-        """
-        Generate a full name identifier URL with the specified scheme.
-
-        Args:
-            name_element: A BeautifulSoup Tag representing an EAD
-                name XML field.
-        """
-        if identifier := name_element.get("authfilenumber"):
-            source = name_element.get("source")
-            if source in ["lcnaf", "naf"]:
-                base_url = "https://lccn.loc.gov/"
-            elif source == "snac":
-                base_url = "https://snaccooperative.org/view/"
-            elif source == "viaf":
-                base_url = "http://viaf.org/viaf/"
-            else:
-                base_url = ""
-            return [base_url + identifier]
-        return None
-
     @classmethod
     def _get_collection_description(cls, source_record: Tag) -> Tag:
         """Get element with archival description for a collection.
@@ -425,6 +374,156 @@ def get_content_type(cls, source_record: Tag) -> list[str] | None:
 
         return content_types or None
 
+    @classmethod
+    def get_contents(cls, source_record: Tag) -> list[str] | None:
+        contents = []
+        collection_description = cls._get_collection_description(source_record)
+        for arrangement_element in collection_description.find_all(
+            "arrangement", recursive=False
+        ):
+            contents.extend(
+                cls.create_list_from_mixed_value(
+                    arrangement_element, skipped_elements=["head"]
+                )
+            )
+        return contents or None
+
+    @classmethod
+    def get_contributors(cls, source_record: Tag) -> list[timdex.Contributor] | None:
+        contributors = []
+        collection_description_did = cls._get_collection_description_did(source_record)
+        for origination_element in collection_description_did.find_all("origination"):
+            contributors.extend(
+                [
+                    timdex.Contributor(
+                        value=contributor_name,
+                        kind=origination_element.get("label") or None,
+                        identifier=cls._get_contributor_identifier_url(
+                            contributor_element
+                        ),
+                    )
+                    for contributor_element in origination_element.find_all(
+                        name=True, recursive=False
+                    )
+                    if (
+                        contributor_name := cls.create_string_from_mixed_value(
+                            contributor_element, separator=" "
+                        )
+                    )
+                ]
+            )
+
+        return contributors or None
+
+    @classmethod
+    def _get_contributor_identifier_url(cls, name_element: Tag) -> list | None:
+        """Generate a full name identifier URL with the specified scheme.
+
+        Args:
+            name_element: A BeautifulSoup Tag representing an EAD
+                name XML field.
+        """
+        if identifier := name_element.get("authfilenumber"):
+            source = name_element.get("source")
+            if source in ["lcnaf", "naf"]:
+                base_url = "https://lccn.loc.gov/"
+            elif source == "snac":
+                base_url = "https://snaccooperative.org/view/"
+            elif source == "viaf":
+                base_url = "http://viaf.org/viaf/"
+            else:
+                base_url = ""
+            return [base_url + identifier]
+        return None
+
+    @classmethod
+    def get_dates(cls, source_record: Tag) -> list[timdex.Date] | None:
+        dates = []
+        collection_description_did = cls._get_collection_description_did(source_record)
+        source_record_id = cls.get_source_record_id(source_record)
+        dates.extend(
+            cls._parse_date_elements(collection_description_did, source_record_id)
+        )
+        return dates or None
+
+    @classmethod
+    def _parse_date_elements(
+        cls, collection_description_did: Tag, source_record_id: str
+    ) -> list[timdex.Date]:
+        """
+        Dates are derived from <archdesc><unitdate> elements with the @normal attribute.
+
+        The method will iterate over these elements, validating the values assigned to
+        the @normal attribute. If the date value passes validation, a timdex.Date
+        instance is created and added to the list of valid timdex.Dates that is
+        returned.
+
+        Note: If the date values in a provided date range are the same, a
+          single date is retrieved.
+        """
+        dates = []
+        for date_element in collection_description_did.find_all("unitdate", normal=True):
+            date_string = date_element.get("normal", "").strip()
+
+            if date_string == "":
+                continue
+
+            date_instance = timdex.Date()
+
+            # get valid date ranges or dates
+            if "/" in date_string:
+                date_instance = cls._parse_date_range(date_string, source_record_id)
+            elif validate_date(date_string, source_record_id):
+                date_instance.value = date_string
+
+            # include @datechar and @certainty attributes
+            date_instance.kind = date_element.get("datechar")
+            date_instance.note = date_element.get("certainty")
+
+            if date_instance.range or date_instance.value:
+                dates.append(date_instance)
+        return dates
+
+    @classmethod
+    def _parse_date_range(cls, date_string: str, source_record_id: str) -> timdex.Date:
+        date_instance = timdex.Date()
+        gte_date, lte_date = date_string.split("/")
+        if gte_date != lte_date:
+            if validate_date_range(
+                gte_date,
+                lte_date,
+                source_record_id,
+            ):
+                date_instance.range = timdex.DateRange(
+                    gte=gte_date,
+                    lte=lte_date,
+                )
+        else:
+            # get valid date (if dates in ranges are the same)
+            date_string = gte_date
+            if validate_date(
+                date_string,
+                source_record_id,
+            ):
+                date_instance.value = date_string
+        return date_instance
+
+    @classmethod
+    def get_identifiers(cls, source_record: Tag) -> list[timdex.Identifier] | None:
+        identifiers = []
+        collection_description_did = cls._get_collection_description_did(source_record)
+        for id_element in collection_description_did.find_all("unitid", recursive=False):
+            if id_element.get("type") == "aspace_uri":
+                continue
+            if id_value := cls.create_string_from_mixed_value(
+                id_element,
+                separator=" ",
+            ):
+                identifiers.append(
+                    timdex.Identifier(value=id_value, kind="Collection Identifier")
+                )
+        return identifiers or None
+
     @classmethod
     def get_main_titles(cls, source_record: Tag) -> list[str]:
         """
@@ -486,54 +585,3 @@ def parse_mixed_value(
         elif isinstance(item, Tag) and item.name not in skipped_elements:
             for child in item.children:
                 yield from cls.parse_mixed_value(child, skipped_elements)
-
-    def parse_dates(
-        self, collection_description_did: Tag, source_record_id: str
-    ) -> list[timdex.Date]:
-        """
-        Dedicated method to parse dates.  Targeting archdesc.unitdata elements, using
-        only those with a @normal attribute value.  These are almost uniformly ranges,
-        but in the event they are not (or two identical values for the range) a single
-        date value is produced.
-        """
-        dates = []
-        for date_element in collection_description_did.find_all("unitdate"):
-            normal_date = date_element.get("normal", "").strip()
-            if normal_date == "":
-                continue
-
-            date_instance = timdex.Date()
-
-            # date range
-            if "/" in normal_date:
-                gte_date, lte_date = normal_date.split("/")
-                if gte_date != lte_date:
-                    if validate_date_range(
-                        gte_date,
-                        lte_date,
-                        source_record_id,
-                    ):
-                        date_instance.range = timdex.DateRange(
-                            gte=gte_date,
-                            lte=lte_date,
-                        )
-                else:
-                    date_str = gte_date  # arbitrarily take one
-                    if validate_date(
-                        date_str,
-                        source_record_id,
-                    ):
-                        date_instance.value = date_str
-
-            # fallback on single date
-            elif validate_date(normal_date, source_record_id):
-                date_instance.value = normal_date
-
-            # include @datechar and @certainty attributes
-            date_instance.kind = date_element.get("datechar")
-            date_instance.note = date_element.get("certainty")
-
-            if date_instance.range or date_instance.value:
-                dates.append(date_instance)
-
-        return dates