diff --git a/catalog/dags/providers/provider_api_scripts/nypl.py b/catalog/dags/providers/provider_api_scripts/nypl.py index 9c436ee232b..8f180628dce 100644 --- a/catalog/dags/providers/provider_api_scripts/nypl.py +++ b/catalog/dags/providers/provider_api_scripts/nypl.py @@ -44,7 +44,7 @@ def get_value_from_dict_or_list( class NyplDataIngester(ProviderDataIngester): providers = {"image": prov.NYPL_DEFAULT_PROVIDER} - endpoint_base = "http://api.repo.nypl.org/api/v1/items" + endpoint_base = "http://api.repo.nypl.org/api/v2/items" endpoint = f"{endpoint_base}/search/" metadata_endpoint = f"{endpoint_base}/item_details/" batch_limit = 500 @@ -138,6 +138,7 @@ def get_record_data(self, data): "filetype": filetype, "category": category, "meta_data": metadata, + "raw_tags": NyplDataIngester._get_tags(mods) or None, } images.append(image_data) return images @@ -221,6 +222,22 @@ def _get_creators(creatorinfo): return info.get("namePart", {}).get("$") return None + @staticmethod + def _get_tags(mods: dict) -> list[str]: + subject_list = mods.get("subject", []) + if isinstance(subject_list, dict): + subject_list = [subject_list] + # Topic can be a dictionary or a list + topics = [subject["topic"] for subject in subject_list if "topic" in subject] + tags = [] + if topics: + for topic in topics: + if isinstance(topic, list): + tags.extend([t.get("$") for t in topic]) + else: + tags.append(topic.get("$")) + return [tag for tag in tags if tag] + @staticmethod def _get_type_of_resource(mods: dict) -> str | None: type_of_resource = mods.get("typeOfResource", {}) @@ -279,21 +296,6 @@ def _get_metadata(mods): ): metadata["physical_description"] = physical_description - subject_list = mods.get("subject", []) - if isinstance(subject_list, dict): - subject_list = [subject_list] - # Topic can be a dictionary or a list - topics = [subject["topic"] for subject in subject_list if "topic" in subject] - if topics: - tags = [] - for topic in topics: - if isinstance(topic, list): - tags.extend([t.get("$") for t in topic]) - else: - tags.append(topic.get("$")) - if tags: - metadata["tags"] = ", ".join(tags) - return metadata diff --git a/catalog/justfile b/catalog/justfile index e56f644fff6..8e261fde859 100644 --- a/catalog/justfile +++ b/catalog/justfile @@ -82,12 +82,12 @@ shell: env DC_USER="airflow" just ../exec {{ SERVICE }} /bin/bash # Launch an IPython shell in a new container under `SERVICE` -ipython: up-deps +ipython *args: up-deps env DC_USER="airflow" just ../run \ --rm \ --workdir /opt/airflow/catalog/dags \ {{ SERVICE }} \ - bash -c \'ipython\' + bash -c \'ipython {{ args }}\' # Launch a `pgcli` shell in the PostgreSQL container pgcli db_user_pass="deploy" db_name="openledger": up diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/nypl/metadata.json b/catalog/tests/dags/providers/provider_api_scripts/resources/nypl/metadata.json index a99515fec99..7d6d8ec3fc5 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/resources/nypl/metadata.json +++ b/catalog/tests/dags/providers/provider_api_scripts/resources/nypl/metadata.json @@ -3,6 +3,5 @@ "genre": "Maps", "physical_description": "4 polyester film encapsulations, some containing 2 sheets back-to-back. Accompanying text formatted as 1 large sheet (46 x 59 cm), in one of the encapsulations.", "publisher": "New York Public Library, Local History and Genealogy Division", - "tags": "Census districts", "type_of_resource": "cartographic" } diff --git a/catalog/tests/dags/providers/provider_api_scripts/resources/nypl/response_search_success.json b/catalog/tests/dags/providers/provider_api_scripts/resources/nypl/response_search_success.json index cd28d62c440..36ae802fbf4 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/resources/nypl/response_search_success.json +++ b/catalog/tests/dags/providers/provider_api_scripts/resources/nypl/response_search_success.json @@ -17,7 +17,7 @@ "numResults": "1275", "result": [ { - "apiUri": "http://api.repo.nypl.org/api/v1/items/mods/0cabe3d0-3d50-0134-a8e0-00505686a51c", + "apiUri": "http://api.repo.nypl.org/api/v2/items/mods/0cabe3d0-3d50-0134-a8e0-00505686a51c", "imageID": "56738462", "itemLink": "http://digitalcollections.nypl.org/items/0cabe3d0-3d50-0134-a8e0-00505686a51c", "rightsStatement": "To the extent that a jurisdiction grants The New York Public Library a copyright in this item, NYPL makes this item available under a Creative Commons CC0 1.0 Universal Public Domain Dedication. Though not required, if you want to credit us as the source, please use the following statement, \"From The New York Public Library,\" and provide a link back to the item on our Digital Collections site. Doing so helps us track how our collection is used and helps justify freely releasing even more content in the future.", diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py b/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py index fd738f5dafd..33edd6ca58a 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py @@ -80,6 +80,26 @@ def test_get_creators_failure(): assert actual_creator is None +@pytest.mark.parametrize("subject_container", [lambda x: [x], lambda x: x]) +@pytest.mark.parametrize("topic_container", [lambda x: [x], lambda x: x]) +@pytest.mark.parametrize( + "topic, expected_tags", + [ + # No topics + [{}, []], + # Unrelated topics + [{"Unrelated": "Foo"}, []], + # Relevant topics + [{"$": "value"}, ["value"]], + ], +) +def test_get_tags(subject_container, topic_container, topic, expected_tags): + topics = topic_container(topic) + subject = subject_container({"topic": topics}) + actual_tags = nypl._get_tags({"subject": subject}) + assert actual_tags == expected_tags + + def test_get_metadata(): item_response = _get_resource_json("response_itemdetails_success.json") mods = item_response.get("nyplAPI").get("response").get("mods") @@ -138,12 +158,12 @@ def test_get_record_data_success(): "date_issued": "1981", "genre": "Maps", "publisher": "New York Public Library, Local History and Genealogy Division", - "tags": "Census districts", "type_of_resource": "cartographic", "physical_description": "4 polyester film encapsulations, some containing 2 sheets back-to-back. " "Accompanying text formatted as 1 large sheet (46 x 59 cm), in one of " "the encapsulations.", }, + "raw_tags": ["Census districts"], "title": "1900 census enumeration districts, Manhattan and Bronx", "license_info": CC0, }