Skip to content

Commit

Permalink
Add checks for required parameters to Provider scripts (K-R) (#1884)
Browse files Browse the repository at this point in the history
* Replace `is None` checks with `not` checks

* Remove unused parameter

* Update tests for nappy, museum_victoria, provider_data_ingester

* Update catalog/dags/providers/provider_api_scripts/museum_victoria.py

* Update catalog/dags/providers/provider_api_scripts/nypl.py
  • Loading branch information
obulat authored May 5, 2023
1 parent e78b0bc commit 7fea347
Show file tree
Hide file tree
Showing 10 changed files with 194 additions and 105 deletions.
29 changes: 11 additions & 18 deletions catalog/dags/providers/provider_api_scripts/metropolitan_museum.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

from provider_data_ingester import ProviderDataIngester

from common import constants
from common.licenses import get_license_info
from common.loader import provider_details as prov

Expand Down Expand Up @@ -81,8 +82,7 @@ def get_record_data(self, object_id):
object_json = self.delayed_requester.get_response_json(
object_endpoint, self.retries
)

if not object_json.get("isPublicDomain"):
if not object_json or not object_json.get("isPublicDomain"):
self.non_cc0_objects += 1
if self.non_cc0_objects % self.batch_limit == 0:
logger.info(f"Retrieved {self.non_cc0_objects} non-CC0 objects.")
Expand Down Expand Up @@ -131,21 +131,17 @@ def get_should_continue(self, response_json):
if response_json:
return False

def _get_foreign_id(self, object_id: int, image_url: str):
def _get_foreign_id(self, object_id: int, image_url: str) -> str:
unique_identifier = image_url.split("/")[-1].split(".")[0]
return f"{object_id}-{unique_identifier}"

def _get_meta_data(self, object_json):
if object_json is None:
return
def _get_meta_data(self, object_json: dict) -> dict | None:
if object_json.get("accessionNumber"):
return {
"accession_number": object_json.get("accessionNumber"),
}

def _get_tag_list(self, object_json):
if object_json is None:
return
def _get_tag_list(self, object_json: dict) -> list:
tag_list = [
tag
for tag in [
Expand All @@ -165,19 +161,16 @@ def _get_tag_list(self, object_json):
tag_list += [tag["term"] for tag in object_json.get("tags")]
return tag_list

def _get_title(self, object_json):
if object_json is not None:
# Use or to skip false-y (empty) titles: ""
return object_json.get("title") or object_json.get("objectName")
def _get_title(self, object_json: dict) -> str | None:
# Use or to skip false-y (empty) titles: ""
return object_json.get("title") or object_json.get("objectName")

def _get_artist_name(self, object_json):
if object_json is None:
return
def _get_artist_name(self, object_json: dict) -> str | None:
return object_json.get("artistDisplayName")

def get_media_type(self, object_json):
def get_media_type(self, object_json: dict):
# This provider only supports Images.
return "image"
return constants.IMAGE


def main(date: str):
Expand Down
25 changes: 12 additions & 13 deletions catalog/dags/providers/provider_api_scripts/museum_victoria.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
from typing import TypedDict

from common import constants
from common.licenses import LicenseInfo, get_license_info
from common.loader import provider_details as prov
from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester
Expand Down Expand Up @@ -64,16 +65,14 @@ def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dic
}

def get_record_data(self, data: dict):
object_id = data.get("id")
if object_id in self.RECORDS_IDS:
if not (object_id := data.get("id")) or object_id in self.RECORDS_IDS:
return None
self.RECORDS_IDS.add(object_id)
foreign_landing_url = f"{self.LANDING_PAGE}{object_id}"

if (media_data := data.get("media")) is None:
return None
images = self._get_images(media_data)
if len(images) == 0:
if not (media_data := data.get("media")) or not (
images := self._get_images(media_data)
):
return None
meta_data = self._get_metadata(data)
title = data.get("displayTitle")
Expand All @@ -96,17 +95,18 @@ def _get_images(media_data) -> list[ImageDetails]:
for media in media_data:
if media.get("type") != "image":
continue
image_id = media.get("id")
if not (foreign_identifier := media.get("id")):
continue
image_url, height, width, filesize = VictoriaDataIngester._get_image_data(
media
)
license_info = VictoriaDataIngester._get_license_info(media)
if image_url is None or image_id is None or not license_info:
if not image_url or not license_info:
continue
creator = VictoriaDataIngester._get_creator(media)

image: ImageDetails = {
"foreign_identifier": image_id,
"foreign_identifier": foreign_identifier,
"image_url": image_url,
"height": height,
"width": width,
Expand All @@ -117,7 +117,7 @@ def _get_images(media_data) -> list[ImageDetails]:
return images

def get_media_type(self, record: dict) -> str:
return "image"
return constants.IMAGE

@staticmethod
def _get_image_data(
Expand All @@ -126,11 +126,10 @@ def _get_image_data(
height, width, filesize = None, None, None
media_data = {}
for size in ["large", "medium", "small"]:
if size in media:
if isinstance(media.get(size), dict) and media[size].get("uri"):
media_data = media[size]
break
image_url = media_data.get("uri")
if image_url is not None:
if image_url := media_data.get("uri"):
height = media_data.get("height")
width = media_data.get("width")
filesize = media_data.get("size")
Expand Down
12 changes: 7 additions & 5 deletions catalog/dags/providers/provider_api_scripts/nappy.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class NappyDataIngester(ProviderDataIngester):
endpoint = "https://api.nappy.co/v1/openverse/images"
headers = {"User-Agent": prov.UA_STRING, "Accept": "application/json"}

# Hardoded to CC0, the only license Nappy.co uses
# Hardcoded to CC0, the only license Nappy.co uses
license_info = get_license_info(
"https://creativecommons.org/publicdomain/zero/1.0/"
)
Expand Down Expand Up @@ -68,14 +68,16 @@ def _convert_filesize(raw_filesize_string: str) -> int:
multiplier = FILETYPE_MULTIPLIERS[stripped[-2:]]
return round(units * multiplier)

def get_record_data(self, data: dict) -> dict | list[dict] | None:
if (foreign_landing_url := data.get("foreign_landing_url")) is None:
def get_record_data(self, data: dict) -> dict | None:
if not (foreign_landing_url := data.get("foreign_landing_url")):
return None

if (image_url := data.get("url")) is None:
if not (image_url := data.get("url")):
return None

if not (foreign_identifier := data.get("foreign_identifier")):
return None

foreign_identifier = data.get("foreign_identifier")
thumbnail_url = data.get("url") + "?auto=format&w=600&q=75"
filesize = self._convert_filesize(data.get("filesize"))
filetype = data.get("filetype")
Expand Down
51 changes: 32 additions & 19 deletions catalog/dags/providers/provider_api_scripts/nypl.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,54 +85,53 @@ def get_batch_data(self, response_json):
return None

def get_record_data(self, data):
uuid = data.get("uuid")
if not (uuid := data.get("uuid")):
return None

item_json = (
self.get_response_json({}, endpoint=self.metadata_endpoint + uuid) or {}
)
item_details = item_json.get("nyplAPI", {}).get("response")
if not item_details:
if not (item_details := item_json.get("nyplAPI", {}).get("response")):
return None
mods = item_details.get("mods")
mods = item_details.get("mods", {})

title_info = mods.get("titleInfo")
if isinstance(title_info, list) and title_info:
title_info = title_info[0]
title = "" if title_info is None else title_info.get("title", {}).get("$")
title = NyplDataIngester._get_title(mods)

name_properties = mods.get("name")
creator = self._get_creators(name_properties) if name_properties else None
creator = (
NyplDataIngester._get_creators(name_properties)
if (name_properties := mods.get("name"))
else None
)

metadata = self._get_metadata(mods)
metadata = NyplDataIngester._get_metadata(mods)
category = (
ImageCategory.PHOTOGRAPH if metadata.get("genre") == "Photographs" else None
)

captures = item_details.get("sibling_captures", {}).get("capture")
if not captures:
if not (captures := NyplDataIngester._get_captures(item_details)):
return None
if not isinstance(captures, list):
captures = [captures]

images = []
for capture in captures:
image_id = capture.get("imageID", {}).get("$")
if image_id is None:
if not (foreign_identifier := capture.get("imageID", {}).get("$")):
continue

image_link = capture.get("imageLinks", {}).get("imageLink", [])
image_url, filetype = self._get_image_data(image_link)
image_url, filetype = NyplDataIngester._get_image_data(image_link)
if not image_url:
continue

foreign_landing_url = capture.get("itemLink", {}).get("$")
if not foreign_landing_url:
continue
if not (foreign_landing_url := capture.get("itemLink", {}).get("$")):
continue
license_url = capture.get("rightsStatementURI", {}).get("$")
if not (license_info := get_license_info(license_url)):
continue

image_data = {
"foreign_identifier": image_id,
"foreign_identifier": foreign_identifier,
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"license_info": license_info,
Expand All @@ -145,6 +144,20 @@ def get_record_data(self, data):
images.append(image_data)
return images

@staticmethod
def _get_title(mods: dict) -> str:
title_info = mods.get("titleInfo")
if isinstance(title_info, list) and title_info:
title_info = title_info[0]
return "" if not title_info else title_info.get("title", {}).get("$")

@staticmethod
def _get_captures(item_details: dict) -> list[dict]:
captures = item_details.get("sibling_captures", {}).get("capture")
if captures and not isinstance(captures, list):
captures = [captures]
return captures

@staticmethod
def _get_filetype(description: str):
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -444,9 +444,7 @@ def process_batch(self, media_batch) -> int:
processed_count = 0

for data in media_batch:
record_data = self.get_record_data(data)

if record_data is None:
if not (record_data := self.get_record_data(data)):
continue

record_data = (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,6 @@ def test_get_batch_data(response_json, expected):
full_expected_data[0].get("meta_data"),
id="full_object",
),
pytest.param({}, None, id="empty_dict"),
pytest.param(None, None, id="None"),
],
)
def test_get_meta_data(response_json, expected):
Expand All @@ -104,8 +102,6 @@ def test_get_meta_data(response_json, expected):
full_expected_data[0].get("raw_tags"),
id="full_object",
),
pytest.param({}, [], id="empty_dict"),
pytest.param(None, None, id="None"),
],
)
def test_get_tag_list(response_json, expected):
Expand All @@ -131,8 +127,6 @@ def test_get_tag_list(response_json, expected):
"Yes, empty title",
id="empty_string_title",
),
pytest.param({}, None, id="empty_json"),
pytest.param(None, None, id="None"),
],
)
def test_get_title(response_json, expected):
Expand All @@ -144,7 +138,6 @@ def test_get_title(response_json, expected):
"response_json, expected",
[
pytest.param({}, None, id="empty_json"),
pytest.param(None, None, id="None"),
pytest.param(
{"artistDisplayName": "Unidentified flying obj"},
"Unidentified flying obj",
Expand Down
Loading

0 comments on commit 7fea347

Please sign in to comment.