diff --git a/catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 071f48434d5..dd02868b4cd 100644 --- a/catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -30,6 +30,7 @@ from provider_data_ingester import ProviderDataIngester +from common import constants from common.licenses import get_license_info from common.loader import provider_details as prov @@ -81,8 +82,7 @@ def get_record_data(self, object_id): object_json = self.delayed_requester.get_response_json( object_endpoint, self.retries ) - - if not object_json.get("isPublicDomain"): + if not object_json or not object_json.get("isPublicDomain"): self.non_cc0_objects += 1 if self.non_cc0_objects % self.batch_limit == 0: logger.info(f"Retrieved {self.non_cc0_objects} non-CC0 objects.") @@ -131,21 +131,17 @@ def get_should_continue(self, response_json): if response_json: return False - def _get_foreign_id(self, object_id: int, image_url: str): + def _get_foreign_id(self, object_id: int, image_url: str) -> str: unique_identifier = image_url.split("/")[-1].split(".")[0] return f"{object_id}-{unique_identifier}" - def _get_meta_data(self, object_json): - if object_json is None: - return + def _get_meta_data(self, object_json: dict) -> dict | None: if object_json.get("accessionNumber"): return { "accession_number": object_json.get("accessionNumber"), } - def _get_tag_list(self, object_json): - if object_json is None: - return + def _get_tag_list(self, object_json: dict) -> list: tag_list = [ tag for tag in [ @@ -165,19 +161,16 @@ def _get_tag_list(self, object_json): tag_list += [tag["term"] for tag in object_json.get("tags")] return tag_list - def _get_title(self, object_json): - if object_json is not None: - # Use or to skip false-y (empty) titles: "" - return object_json.get("title") or object_json.get("objectName") + def _get_title(self, object_json: dict) -> str | None: + # Use or to skip false-y (empty) titles: "" + return object_json.get("title") or object_json.get("objectName") - def _get_artist_name(self, object_json): - if object_json is None: - return + def _get_artist_name(self, object_json: dict) -> str | None: return object_json.get("artistDisplayName") - def get_media_type(self, object_json): + def get_media_type(self, object_json: dict): # This provider only supports Images. - return "image" + return constants.IMAGE def main(date: str): diff --git a/catalog/dags/providers/provider_api_scripts/museum_victoria.py b/catalog/dags/providers/provider_api_scripts/museum_victoria.py index c08ab8f3c1e..148542f021f 100644 --- a/catalog/dags/providers/provider_api_scripts/museum_victoria.py +++ b/catalog/dags/providers/provider_api_scripts/museum_victoria.py @@ -1,6 +1,7 @@ import logging from typing import TypedDict +from common import constants from common.licenses import LicenseInfo, get_license_info from common.loader import provider_details as prov from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester @@ -64,16 +65,14 @@ def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dic } def get_record_data(self, data: dict): - object_id = data.get("id") - if object_id in self.RECORDS_IDS: + if not (object_id := data.get("id")) or object_id in self.RECORDS_IDS: return None self.RECORDS_IDS.add(object_id) foreign_landing_url = f"{self.LANDING_PAGE}{object_id}" - if (media_data := data.get("media")) is None: - return None - images = self._get_images(media_data) - if len(images) == 0: + if not (media_data := data.get("media")) or not ( + images := self._get_images(media_data) + ): return None meta_data = self._get_metadata(data) title = data.get("displayTitle") @@ -96,17 +95,18 @@ def _get_images(media_data) -> list[ImageDetails]: for media in media_data: if media.get("type") != "image": continue - image_id = media.get("id") + if not (foreign_identifier := media.get("id")): + continue image_url, height, width, filesize = VictoriaDataIngester._get_image_data( media ) license_info = VictoriaDataIngester._get_license_info(media) - if image_url is None or image_id is None or not license_info: + if not image_url or not license_info: continue creator = VictoriaDataIngester._get_creator(media) image: ImageDetails = { - "foreign_identifier": image_id, + "foreign_identifier": foreign_identifier, "image_url": image_url, "height": height, "width": width, @@ -117,7 +117,7 @@ def _get_images(media_data) -> list[ImageDetails]: return images def get_media_type(self, record: dict) -> str: - return "image" + return constants.IMAGE @staticmethod def _get_image_data( @@ -126,11 +126,10 @@ def _get_image_data( height, width, filesize = None, None, None media_data = {} for size in ["large", "medium", "small"]: - if size in media: + if isinstance(media.get(size), dict) and media[size].get("uri"): media_data = media[size] break - image_url = media_data.get("uri") - if image_url is not None: + if image_url := media_data.get("uri"): height = media_data.get("height") width = media_data.get("width") filesize = media_data.get("size") diff --git a/catalog/dags/providers/provider_api_scripts/nappy.py b/catalog/dags/providers/provider_api_scripts/nappy.py index 6338151546c..b6585c71766 100644 --- a/catalog/dags/providers/provider_api_scripts/nappy.py +++ b/catalog/dags/providers/provider_api_scripts/nappy.py @@ -26,7 +26,7 @@ class NappyDataIngester(ProviderDataIngester): endpoint = "https://api.nappy.co/v1/openverse/images" headers = {"User-Agent": prov.UA_STRING, "Accept": "application/json"} - # Hardoded to CC0, the only license Nappy.co uses + # Hardcoded to CC0, the only license Nappy.co uses license_info = get_license_info( "https://creativecommons.org/publicdomain/zero/1.0/" ) @@ -68,14 +68,16 @@ def _convert_filesize(raw_filesize_string: str) -> int: multiplier = FILETYPE_MULTIPLIERS[stripped[-2:]] return round(units * multiplier) - def get_record_data(self, data: dict) -> dict | list[dict] | None: - if (foreign_landing_url := data.get("foreign_landing_url")) is None: + def get_record_data(self, data: dict) -> dict | None: + if not (foreign_landing_url := data.get("foreign_landing_url")): return None - if (image_url := data.get("url")) is None: + if not (image_url := data.get("url")): + return None + + if not (foreign_identifier := data.get("foreign_identifier")): return None - foreign_identifier = data.get("foreign_identifier") thumbnail_url = data.get("url") + "?auto=format&w=600&q=75" filesize = self._convert_filesize(data.get("filesize")) filetype = data.get("filetype") diff --git a/catalog/dags/providers/provider_api_scripts/nypl.py b/catalog/dags/providers/provider_api_scripts/nypl.py index 48971b2b2e7..55c85df629a 100644 --- a/catalog/dags/providers/provider_api_scripts/nypl.py +++ b/catalog/dags/providers/provider_api_scripts/nypl.py @@ -85,54 +85,53 @@ def get_batch_data(self, response_json): return None def get_record_data(self, data): - uuid = data.get("uuid") + if not (uuid := data.get("uuid")): + return None item_json = ( self.get_response_json({}, endpoint=self.metadata_endpoint + uuid) or {} ) - item_details = item_json.get("nyplAPI", {}).get("response") - if not item_details: + if not (item_details := item_json.get("nyplAPI", {}).get("response")): return None - mods = item_details.get("mods") + mods = item_details.get("mods", {}) - title_info = mods.get("titleInfo") - if isinstance(title_info, list) and title_info: - title_info = title_info[0] - title = "" if title_info is None else title_info.get("title", {}).get("$") + title = NyplDataIngester._get_title(mods) - name_properties = mods.get("name") - creator = self._get_creators(name_properties) if name_properties else None + creator = ( + NyplDataIngester._get_creators(name_properties) + if (name_properties := mods.get("name")) + else None + ) - metadata = self._get_metadata(mods) + metadata = NyplDataIngester._get_metadata(mods) category = ( ImageCategory.PHOTOGRAPH if metadata.get("genre") == "Photographs" else None ) - captures = item_details.get("sibling_captures", {}).get("capture") - if not captures: + if not (captures := NyplDataIngester._get_captures(item_details)): return None - if not isinstance(captures, list): - captures = [captures] + images = [] for capture in captures: - image_id = capture.get("imageID", {}).get("$") - if image_id is None: + if not (foreign_identifier := capture.get("imageID", {}).get("$")): continue image_link = capture.get("imageLinks", {}).get("imageLink", []) - image_url, filetype = self._get_image_data(image_link) + image_url, filetype = NyplDataIngester._get_image_data(image_link) if not image_url: continue foreign_landing_url = capture.get("itemLink", {}).get("$") if not foreign_landing_url: continue + if not (foreign_landing_url := capture.get("itemLink", {}).get("$")): + continue license_url = capture.get("rightsStatementURI", {}).get("$") if not (license_info := get_license_info(license_url)): continue image_data = { - "foreign_identifier": image_id, + "foreign_identifier": foreign_identifier, "foreign_landing_url": foreign_landing_url, "image_url": image_url, "license_info": license_info, @@ -145,6 +144,20 @@ def get_record_data(self, data): images.append(image_data) return images + @staticmethod + def _get_title(mods: dict) -> str: + title_info = mods.get("titleInfo") + if isinstance(title_info, list) and title_info: + title_info = title_info[0] + return "" if not title_info else title_info.get("title", {}).get("$") + + @staticmethod + def _get_captures(item_details: dict) -> list[dict]: + captures = item_details.get("sibling_captures", {}).get("capture") + if captures and not isinstance(captures, list): + captures = [captures] + return captures + @staticmethod def _get_filetype(description: str): """ diff --git a/catalog/dags/providers/provider_api_scripts/provider_data_ingester.py b/catalog/dags/providers/provider_api_scripts/provider_data_ingester.py index d6ba0cf2848..8c248d1392a 100644 --- a/catalog/dags/providers/provider_api_scripts/provider_data_ingester.py +++ b/catalog/dags/providers/provider_api_scripts/provider_data_ingester.py @@ -444,9 +444,7 @@ def process_batch(self, media_batch) -> int: processed_count = 0 for data in media_batch: - record_data = self.get_record_data(data) - - if record_data is None: + if not (record_data := self.get_record_data(data)): continue record_data = ( diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/catalog/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index 434db312d87..f84903a309e 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -82,8 +82,6 @@ def test_get_batch_data(response_json, expected): full_expected_data[0].get("meta_data"), id="full_object", ), - pytest.param({}, None, id="empty_dict"), - pytest.param(None, None, id="None"), ], ) def test_get_meta_data(response_json, expected): @@ -104,8 +102,6 @@ def test_get_meta_data(response_json, expected): full_expected_data[0].get("raw_tags"), id="full_object", ), - pytest.param({}, [], id="empty_dict"), - pytest.param(None, None, id="None"), ], ) def test_get_tag_list(response_json, expected): @@ -131,8 +127,6 @@ def test_get_tag_list(response_json, expected): "Yes, empty title", id="empty_string_title", ), - pytest.param({}, None, id="empty_json"), - pytest.param(None, None, id="None"), ], ) def test_get_title(response_json, expected): @@ -144,7 +138,6 @@ def test_get_title(response_json, expected): "response_json, expected", [ pytest.param({}, None, id="empty_json"), - pytest.param(None, None, id="None"), pytest.param( {"artistDisplayName": "Unidentified flying obj"}, "Unidentified flying obj", diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_museum_victoria.py b/catalog/tests/dags/providers/provider_api_scripts/test_museum_victoria.py index b848657f860..bfdb3c7e1a0 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_museum_victoria.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_museum_victoria.py @@ -119,6 +119,18 @@ def test_no_duplicate_records(): assert actual_image_data is None +@pytest.mark.parametrize( + "falsy_parameter", + ["id", "media"], +) +def test_get_record_data_returns_none_with_falsy_param(falsy_parameter): + media = _get_resource_json("record_data.json") + media[falsy_parameter] = "" + actual_image_data = mv.get_record_data(media) + + assert actual_image_data is None + + def test_get_images_success(): media = _get_resource_json("media_data_success.json") actual_image_data = mv._get_images([media]) @@ -144,52 +156,29 @@ def test_get_media_info_failure(): assert actual_image_data is None -def test_get_image_data_large(): - image_data = _get_resource_json("large_image_data.json") - - actual_image_url, actual_height, actual_width, actual_filesize = mv._get_image_data( - image_data - ) - - assert actual_image_url == ( - "https://collections.museumsvictoria.com.au/content/media/45/" - "329745-large.jpg" - ) - assert actual_height == 2581 - assert actual_width == 2785 - assert actual_filesize == 890933 - - -def test_get_image_data_medium(): - image_data = _get_resource_json("medium_image_data.json") - - actual_image_url, actual_height, actual_width, actual_filesize = mv._get_image_data( - image_data - ) - - assert actual_image_url == ( - "https://collections.museumsvictoria.com.au/content/media/45/" - "329745-medium.jpg" +@pytest.mark.parametrize( + "image_size, expected_height, expected_width, expected_filesize", + [ + pytest.param("large", 2581, 2785, 890933, id="large"), + pytest.param("medium", 1390, 1500, 170943, id="medium"), + pytest.param("small", 500, 540, 20109, id="small"), + ], +) +def test_get_image_data(image_size, expected_height, expected_width, expected_filesize): + image_data = _get_resource_json(f"{image_size}_image_data.json") + expected_url = ( + f"https://collections.museumsvictoria.com.au/content/media/45/" + f"329745-{image_size}.jpg" ) - assert actual_height == 1390 - assert actual_width == 1500 - assert actual_filesize == 170943 - - -def test_get_image_data_small(): - image_data = _get_resource_json("small_image_data.json") actual_image_url, actual_height, actual_width, actual_filesize = mv._get_image_data( image_data ) - assert actual_image_url == ( - "https://collections.museumsvictoria.com.au/content/media/45/" - "329745-small.jpg" - ) - assert actual_height == 500 - assert actual_width == 540 - assert actual_filesize == 20109 + assert actual_image_url == expected_url + assert actual_height == expected_height + assert actual_width == expected_width + assert actual_filesize == expected_filesize def test_get_image_data_none(): diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_nappy.py b/catalog/tests/dags/providers/provider_api_scripts/test_nappy.py index 74a531f3e3c..7648a30bfa8 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_nappy.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_nappy.py @@ -89,6 +89,17 @@ def test_get_should_continue(response_json, expected_result): [ pytest.param({}, None, id="empty_dict"), pytest.param(FULL_BATCH_RESPONSE, None, id="no_urls"), + pytest.param( + {**SINGLE_ITEM, "foreign_landing_url": ""}, + None, + id="falsy_foreign_landing_url", + ), + pytest.param( + {**SINGLE_ITEM, "foreign_identifier": ""}, + None, + id="falsy_foreign_identifier", + ), + pytest.param({**SINGLE_ITEM, "url": ""}, None, id="falsy_url"), pytest.param( SINGLE_ITEM, { diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py b/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py index 9dce6f680c0..9e360bd97fe 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py @@ -160,6 +160,80 @@ def test_get_record_data_failure(): assert images is None +def test_get_record_data_missing_uuid_returns_none(): + _get_resource_json("response_search_success.json") + + result = {"uuid": ""} + images = nypl.get_record_data(result) + assert images is None + + +def test_get_record_data_missing_item_details_returns_none(): + _get_resource_json("response_search_success.json") + result = {"nyplAPI": {"response": {}}} + + item_response = None + with patch.object(nypl, "get_response_json", return_value=item_response): + images = nypl.get_record_data(result) + assert images is None + + +def test_get_record_data_missing_captures_returns_none(): + _get_resource_json("response_search_success.json") + result = {"nyplAPI": {"response": {"sibling_captures": {"capture": []}}}} + + item_response = None + with patch.object(nypl, "get_response_json", return_value=item_response): + images = nypl.get_record_data(result) + assert images is None + + +@pytest.mark.parametrize( + "falsy_property, api_param", + [ + pytest.param("foreign_identifier", "imageID", id="foreign_identifier-imageID"), + pytest.param( + "foreign_landing_url", "itemLink", id="foreign_landing_url-itemLink" + ), + pytest.param( + "license_info", "rightsStatementURI", id="license_info-rightsStatementURI" + ), + ], +) +def test_get_record_data_missing_required_params_returns_empty_list( + falsy_property, api_param +): + search_response = _get_resource_json("response_search_success.json") + result = search_response["nyplAPI"]["response"]["result"][0] + + item_response = _get_resource_json("response_itemdetails_success.json") + test_capture = item_response["nyplAPI"]["response"]["sibling_captures"]["capture"][ + 0 + ] + test_capture[api_param]["$"] = "" + item_response["nyplAPI"]["response"]["sibling_captures"]["capture"] = [test_capture] + + with patch.object(nypl, "get_response_json", return_value=item_response): + images = nypl.get_record_data(result) + assert images == [] + + +def test_get_record_data_missing_image_urls_returns_empty_dictionary(): + search_response = _get_resource_json("response_search_success.json") + result = search_response["nyplAPI"]["response"]["result"][0] + + item_response = _get_resource_json("response_itemdetails_success.json") + test_capture = item_response["nyplAPI"]["response"]["sibling_captures"]["capture"][ + 0 + ] + test_capture["imageLinks"]["imageLink"] = [] + item_response["nyplAPI"]["response"]["sibling_captures"]["capture"] = [test_capture] + + with patch.object(nypl, "get_response_json", return_value=item_response): + images = nypl.get_record_data(result) + assert images == [] + + @pytest.mark.parametrize( "dict_or_list, keys, expected", [ diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_provider_data_ingester.py b/catalog/tests/dags/providers/provider_api_scripts/test_provider_data_ingester.py index b9dc04a5400..32e91421490 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_provider_data_ingester.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_provider_data_ingester.py @@ -177,6 +177,23 @@ def test_process_batch_handles_list_of_records(): assert image_store_mock.call_count == 1 +def test_process_batch_handles_empty_dictionary_of_records_from_get_record_data(): + with ( + patch.object(audio_store, "add_item") as audio_store_mock, + patch.object(image_store, "add_item") as image_store_mock, + patch.object(ingester, "get_record_data") as get_record_data_mock, + ): + # Mock `get_record_data` to return an empty list of records + get_record_data_mock.return_value = [] + + record_count = ingester.process_batch([{}]) + + # Both records are added, and to the appropriate stores + assert record_count == 0 + assert audio_store_mock.call_count == 0 + assert image_store_mock.call_count == 0 + + def test_process_batch_halts_processing_after_reaching_ingestion_limit(): # Set up an ingester with an ingestion limit of 1 ingester = MockProviderDataIngester()