From 629b1a8b816a71e9d946304c3ef83e35367ff284 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Sun, 23 Apr 2023 10:44:26 +0300 Subject: [PATCH] Replace `is None` checks with `not` checks --- .../metropolitan_museum.py | 29 +++---- .../provider_api_scripts/museum_victoria.py | 25 +++--- .../providers/provider_api_scripts/nappy.py | 10 ++- .../providers/provider_api_scripts/nypl.py | 51 ++++++++----- .../provider_data_ingester.py | 4 +- .../test_metropolitan_museum.py | 7 -- .../provider_api_scripts/test_nypl.py | 76 +++++++++++++++++++ 7 files changed, 138 insertions(+), 64 deletions(-) diff --git a/catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 071f48434d5..dd02868b4cd 100644 --- a/catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -30,6 +30,7 @@ from provider_data_ingester import ProviderDataIngester +from common import constants from common.licenses import get_license_info from common.loader import provider_details as prov @@ -81,8 +82,7 @@ def get_record_data(self, object_id): object_json = self.delayed_requester.get_response_json( object_endpoint, self.retries ) - - if not object_json.get("isPublicDomain"): + if not object_json or not object_json.get("isPublicDomain"): self.non_cc0_objects += 1 if self.non_cc0_objects % self.batch_limit == 0: logger.info(f"Retrieved {self.non_cc0_objects} non-CC0 objects.") @@ -131,21 +131,17 @@ def get_should_continue(self, response_json): if response_json: return False - def _get_foreign_id(self, object_id: int, image_url: str): + def _get_foreign_id(self, object_id: int, image_url: str) -> str: unique_identifier = image_url.split("/")[-1].split(".")[0] return f"{object_id}-{unique_identifier}" - def _get_meta_data(self, object_json): - if object_json is None: - return + def _get_meta_data(self, object_json: dict) -> dict | None: if object_json.get("accessionNumber"): return { "accession_number": object_json.get("accessionNumber"), } - def _get_tag_list(self, object_json): - if object_json is None: - return + def _get_tag_list(self, object_json: dict) -> list: tag_list = [ tag for tag in [ @@ -165,19 +161,16 @@ def _get_tag_list(self, object_json): tag_list += [tag["term"] for tag in object_json.get("tags")] return tag_list - def _get_title(self, object_json): - if object_json is not None: - # Use or to skip false-y (empty) titles: "" - return object_json.get("title") or object_json.get("objectName") + def _get_title(self, object_json: dict) -> str | None: + # Use or to skip false-y (empty) titles: "" + return object_json.get("title") or object_json.get("objectName") - def _get_artist_name(self, object_json): - if object_json is None: - return + def _get_artist_name(self, object_json: dict) -> str | None: return object_json.get("artistDisplayName") - def get_media_type(self, object_json): + def get_media_type(self, object_json: dict): # This provider only supports Images. - return "image" + return constants.IMAGE def main(date: str): diff --git a/catalog/dags/providers/provider_api_scripts/museum_victoria.py b/catalog/dags/providers/provider_api_scripts/museum_victoria.py index c08ab8f3c1e..2d0f73505c6 100644 --- a/catalog/dags/providers/provider_api_scripts/museum_victoria.py +++ b/catalog/dags/providers/provider_api_scripts/museum_victoria.py @@ -1,6 +1,7 @@ import logging from typing import TypedDict +from common import constants from common.licenses import LicenseInfo, get_license_info from common.loader import provider_details as prov from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester @@ -64,16 +65,14 @@ def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dic } def get_record_data(self, data: dict): - object_id = data.get("id") - if object_id in self.RECORDS_IDS: + if not (object_id := data.get("id")) or object_id in self.RECORDS_IDS: return None self.RECORDS_IDS.add(object_id) foreign_landing_url = f"{self.LANDING_PAGE}{object_id}" - if (media_data := data.get("media")) is None: - return None - images = self._get_images(media_data) - if len(images) == 0: + if not (media_data := data.get("media")) or not ( + images := self._get_images(media_data) + ): return None meta_data = self._get_metadata(data) title = data.get("displayTitle") @@ -96,17 +95,18 @@ def _get_images(media_data) -> list[ImageDetails]: for media in media_data: if media.get("type") != "image": continue - image_id = media.get("id") + if not (foreign_identifier := media.get("id")): + continue image_url, height, width, filesize = VictoriaDataIngester._get_image_data( media ) license_info = VictoriaDataIngester._get_license_info(media) - if image_url is None or image_id is None or not license_info: + if not image_url or not license_info: continue creator = VictoriaDataIngester._get_creator(media) image: ImageDetails = { - "foreign_identifier": image_id, + "foreign_identifier": foreign_identifier, "image_url": image_url, "height": height, "width": width, @@ -117,7 +117,7 @@ def _get_images(media_data) -> list[ImageDetails]: return images def get_media_type(self, record: dict) -> str: - return "image" + return constants.IMAGE @staticmethod def _get_image_data( @@ -126,11 +126,10 @@ def _get_image_data( height, width, filesize = None, None, None media_data = {} for size in ["large", "medium", "small"]: - if size in media: + if size in media and "uri" in media[size]: media_data = media[size] break - image_url = media_data.get("uri") - if image_url is not None: + if image_url := media_data.get("uri"): height = media_data.get("height") width = media_data.get("width") filesize = media_data.get("size") diff --git a/catalog/dags/providers/provider_api_scripts/nappy.py b/catalog/dags/providers/provider_api_scripts/nappy.py index 6338151546c..d7676408b50 100644 --- a/catalog/dags/providers/provider_api_scripts/nappy.py +++ b/catalog/dags/providers/provider_api_scripts/nappy.py @@ -68,14 +68,16 @@ def _convert_filesize(raw_filesize_string: str) -> int: multiplier = FILETYPE_MULTIPLIERS[stripped[-2:]] return round(units * multiplier) - def get_record_data(self, data: dict) -> dict | list[dict] | None: - if (foreign_landing_url := data.get("foreign_landing_url")) is None: + def get_record_data(self, data: dict) -> dict | None: + if not (foreign_landing_url := data.get("foreign_landing_url")): return None - if (image_url := data.get("url")) is None: + if not (image_url := data.get("url")): + return None + + if not (foreign_identifier := data.get("foreign_identifier")): return None - foreign_identifier = data.get("foreign_identifier") thumbnail_url = data.get("url") + "?auto=format&w=600&q=75" filesize = self._convert_filesize(data.get("filesize")) filetype = data.get("filetype") diff --git a/catalog/dags/providers/provider_api_scripts/nypl.py b/catalog/dags/providers/provider_api_scripts/nypl.py index 48971b2b2e7..2ff7ef842f5 100644 --- a/catalog/dags/providers/provider_api_scripts/nypl.py +++ b/catalog/dags/providers/provider_api_scripts/nypl.py @@ -85,54 +85,53 @@ def get_batch_data(self, response_json): return None def get_record_data(self, data): - uuid = data.get("uuid") + if not (uuid := data.get("uuid")): + return None item_json = ( self.get_response_json({}, endpoint=self.metadata_endpoint + uuid) or {} ) - item_details = item_json.get("nyplAPI", {}).get("response") - if not item_details: + if not (item_details := item_json.get("nyplAPI", {}).get("response")): return None - mods = item_details.get("mods") + mods = item_details.get("mods", {}) - title_info = mods.get("titleInfo") - if isinstance(title_info, list) and title_info: - title_info = title_info[0] - title = "" if title_info is None else title_info.get("title", {}).get("$") + title = NyplDataIngester._get_title(mods) - name_properties = mods.get("name") - creator = self._get_creators(name_properties) if name_properties else None + creator = ( + NyplDataIngester._get_creators(name_properties) + if (name_properties := mods.get("name")) + else None + ) - metadata = self._get_metadata(mods) + metadata = NyplDataIngester._get_metadata(mods) category = ( ImageCategory.PHOTOGRAPH if metadata.get("genre") == "Photographs" else None ) - captures = item_details.get("sibling_captures", {}).get("capture") - if not captures: + if not (captures := NyplDataIngester._get_captures(item_details)): return None - if not isinstance(captures, list): - captures = [captures] + images = [] for capture in captures: - image_id = capture.get("imageID", {}).get("$") - if image_id is None: + if not (foreign_identifier := capture.get("imageID", {}).get("$")): continue image_link = capture.get("imageLinks", {}).get("imageLink", []) - image_url, filetype = self._get_image_data(image_link) + image_url, filetype = NyplDataIngester._get_image_data(image_link) if not image_url: continue foreign_landing_url = capture.get("itemLink", {}).get("$") if not foreign_landing_url: continue + if not (foreign_landing_url := capture.get("itemLink", {}).get("$")): + continue license_url = capture.get("rightsStatementURI", {}).get("$") if not (license_info := get_license_info(license_url)): continue image_data = { - "foreign_identifier": image_id, + "foreign_identifier": foreign_identifier, "foreign_landing_url": foreign_landing_url, "image_url": image_url, "license_info": license_info, @@ -145,6 +144,20 @@ def get_record_data(self, data): images.append(image_data) return images + @staticmethod + def _get_title(mods: dict) -> str: + title_info = mods.get("titleInfo") + if isinstance(title_info, list) and title_info: + title_info = title_info[0] + return "" if title_info is None else title_info.get("title", {}).get("$") + + @staticmethod + def _get_captures(item_details: dict) -> list[dict]: + captures = item_details.get("sibling_captures", {}).get("capture") + if captures and not isinstance(captures, list): + captures = [captures] + return captures + @staticmethod def _get_filetype(description: str): """ diff --git a/catalog/dags/providers/provider_api_scripts/provider_data_ingester.py b/catalog/dags/providers/provider_api_scripts/provider_data_ingester.py index d6ba0cf2848..8c248d1392a 100644 --- a/catalog/dags/providers/provider_api_scripts/provider_data_ingester.py +++ b/catalog/dags/providers/provider_api_scripts/provider_data_ingester.py @@ -444,9 +444,7 @@ def process_batch(self, media_batch) -> int: processed_count = 0 for data in media_batch: - record_data = self.get_record_data(data) - - if record_data is None: + if not (record_data := self.get_record_data(data)): continue record_data = ( diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/catalog/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index 434db312d87..f84903a309e 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -82,8 +82,6 @@ def test_get_batch_data(response_json, expected): full_expected_data[0].get("meta_data"), id="full_object", ), - pytest.param({}, None, id="empty_dict"), - pytest.param(None, None, id="None"), ], ) def test_get_meta_data(response_json, expected): @@ -104,8 +102,6 @@ def test_get_meta_data(response_json, expected): full_expected_data[0].get("raw_tags"), id="full_object", ), - pytest.param({}, [], id="empty_dict"), - pytest.param(None, None, id="None"), ], ) def test_get_tag_list(response_json, expected): @@ -131,8 +127,6 @@ def test_get_tag_list(response_json, expected): "Yes, empty title", id="empty_string_title", ), - pytest.param({}, None, id="empty_json"), - pytest.param(None, None, id="None"), ], ) def test_get_title(response_json, expected): @@ -144,7 +138,6 @@ def test_get_title(response_json, expected): "response_json, expected", [ pytest.param({}, None, id="empty_json"), - pytest.param(None, None, id="None"), pytest.param( {"artistDisplayName": "Unidentified flying obj"}, "Unidentified flying obj", diff --git a/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py b/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py index 9dce6f680c0..41eb06e0af4 100644 --- a/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py +++ b/catalog/tests/dags/providers/provider_api_scripts/test_nypl.py @@ -160,6 +160,82 @@ def test_get_record_data_failure(): assert images is None +def test_get_record_data_missing_uuid_returns_none(): + _get_resource_json("response_search_success.json") + + result = {"uuid": ""} + images = nypl.get_record_data(result) + assert images is None + + +def test_get_record_data_missing_item_details_returns_none(): + _get_resource_json("response_search_success.json") + result = {"nyplAPI": {"response": {}}} + + item_response = None + with patch.object(nypl, "get_response_json", return_value=item_response): + images = nypl.get_record_data(result) + assert images is None + + +def test_get_record_data_missing_captures_returns_none(): + _get_resource_json("response_search_success.json") + result = {"nyplAPI": {"response": {"sibling_captures": {"capture": []}}}} + + item_response = None + with patch.object(nypl, "get_response_json", return_value=item_response): + images = nypl.get_record_data(result) + assert images is None + + +@pytest.mark.parametrize( + "falsy_property, api_param", + [ + pytest.param("foreign_identifier", "imageID", id="foreign_identifier-imageID"), + pytest.param( + "foreign_landing_url", "itemLink", id="foreign_landing_url-itemLink" + ), + pytest.param( + "license_info", "rightsStatementURI", id="license_info-rightsStatementURI" + ), + ], +) +def test_get_record_data_missing_required_params_returns_empty_list( + falsy_property, api_param +): + search_response = _get_resource_json("response_search_success.json") + result = search_response["nyplAPI"]["response"]["result"][0] + + item_response = _get_resource_json("response_itemdetails_success.json") + test_capture = item_response["nyplAPI"]["response"]["sibling_captures"]["capture"][ + 0 + ] + test_capture[api_param]["$"] = "" + item_response["nyplAPI"]["response"]["sibling_captures"]["capture"] = [test_capture] + + with patch.object(nypl, "get_response_json", return_value=item_response): + images = nypl.get_record_data(result) + assert images == [] + + +def test_get_record_data_missing_image_urls_returns_empty_dictionary( + falsy_property, api_param +): + search_response = _get_resource_json("response_search_success.json") + result = search_response["nyplAPI"]["response"]["result"][0] + + item_response = _get_resource_json("response_itemdetails_success.json") + test_capture = item_response["nyplAPI"]["response"]["sibling_captures"]["capture"][ + 0 + ] + test_capture["imageLinks"]["imageLink"] = [] + item_response["nyplAPI"]["response"]["sibling_captures"]["capture"] = [test_capture] + + with patch.object(nypl, "get_response_json", return_value=item_response): + images = nypl.get_record_data(result) + assert images == [] + + @pytest.mark.parametrize( "dict_or_list, keys, expected", [