From 323d07bc0786fe4593df1b77fa890ae7f37f5668 Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Thu, 29 Sep 2022 17:44:11 -0400 Subject: [PATCH] Refactor SMK script to use the `ProviderDataIngester` class (#742) Co-authored-by: Madison Swain-Bowden --- .../providers/provider_api_scripts/smk.py | 366 ++++++++---------- .../dags/providers/provider_workflows.py | 2 + .../smk/item_with_alternative_images.json | 188 +++++++++ .../resources/smk/items_batch.json | 106 ----- .../resources/smk/response_failure.json | 8 - .../resources/smk/response_success.json | 114 ------ .../provider_api_scripts/test_smk.py | 237 +++--------- 7 files changed, 400 insertions(+), 621 deletions(-) create mode 100644 tests/dags/providers/provider_api_scripts/resources/smk/item_with_alternative_images.json delete mode 100644 tests/dags/providers/provider_api_scripts/resources/smk/items_batch.json delete mode 100644 tests/dags/providers/provider_api_scripts/resources/smk/response_failure.json delete mode 100644 tests/dags/providers/provider_api_scripts/resources/smk/response_success.json diff --git a/openverse_catalog/dags/providers/provider_api_scripts/smk.py b/openverse_catalog/dags/providers/provider_api_scripts/smk.py index e16fdd013..76871e27b 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/smk.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/smk.py @@ -1,223 +1,177 @@ import logging +from common import constants from common.licenses import get_license_info from common.loader import provider_details as prov -from common.requester import DelayedRequester -from common.storage.image import ImageStore -from requests.exceptions import JSONDecodeError +from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester -logging.basicConfig( - format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.INFO -) logger = logging.getLogger(__name__) -LIMIT = 2000 -DELAY = 5 -RETRIES = 3 -PROVIDER = prov.SMK_DEFAULT_PROVIDER -ENDPOINT = "https://api.smk.dk/api/v1/art/search/" -LANDING_PAGE_BASE_URL = "https://open.smk.dk/en/artwork/image/" -IMAGE_SIZE = 2048 -delay_request = DelayedRequester(delay=DELAY) -image_store = ImageStore(provider=PROVIDER) +class SmkDataIngester(ProviderDataIngester): + endpoint = "https://api.smk.dk/api/v1/art/search/" + delay = 5 + batch_limit = 2000 + headers = {"Accept": "application/json"} + providers = {"image": prov.SMK_DEFAULT_PROVIDER} -DEFAULT_QUERY_PARAMS = { - "keys": "*", - "filters": "[has_image:true],[public_domain:true]", - "offset": 0, - "rows": LIMIT, -} + def get_media_type(self, record: dict) -> str: + return constants.IMAGE -HEADERS = {"Accept": "application/json"} + def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict: + if not prev_query_params: + return { + "keys": "*", + "filters": "[has_image:true],[public_domain:true]", + "offset": 0, + "rows": self.batch_limit, + } + return { + **prev_query_params, + "offset": prev_query_params["offset"] + self.batch_limit, + } + + def get_batch_data(self, response_json) -> list: + return response_json.get("items") + + @staticmethod + def _get_foreign_landing_url(item) -> str | None: + """Use the English site instead of the original link.""" + object_num = item.get("object_number") + if not object_num: + logger.info( + f"Image with (foreign) id {item.get('id')} does not have " + "`object_number`! Therefore we cannot build the " + "foreign_landing_url." + ) + return + return f"https://open.smk.dk/en/artwork/image/{object_num}" + + @staticmethod + def _get_image_url(image_iiif_id: str, image_size=2048): + # For high quality IIIF-enabled images, restrict the image size to prevent + # loading very large files. + # TODO: consider just using the full "image_native" when adding the + # "image_thumbnail". + image_url = f"{image_iiif_id}/full/!{image_size},/0/default.jpg" + return image_url + + @staticmethod + def _get_title(item: dict) -> str | None: + titles = item.get("titles") + if not titles or not isinstance(titles, list): + logger.info(f"No title for image with (foreign) id {item.get('id')}.") + return + return titles[0].get("title") + + @staticmethod + def _get_creator(item: dict) -> str | None: + # TODO: review this field, there could be more than one creator or artist. + # Keeping it as it was for the class refactor. + data = item.get("production", []) + if not data or not isinstance(data, list): + return + return data[0].get("creator") + + @staticmethod + def _get_images(item: dict) -> list: + images = [] + + # Legacy images do not have an iiif_id; fall back to the ID from the + # collection DB. + iiif_id = item.get("image_iiif_id") + image_id = iiif_id or item.get("id") + + if image_id is not None: + if iiif_id is None: + # Legacy images do not have IIIF links. + image_url = item.get("image_native") + else: + image_url = SmkDataIngester._get_image_url(iiif_id) + + height = item.get("image_height") + width = item.get("image_width") + filesize = item.get("image_size") or item.get("size") + images.append( + { + "id": image_id, + "image_url": image_url, + "height": height, + "width": width, + "filesize": filesize, + } + ) + + alternative_images = item.get("alternative_images") + if type(alternative_images) == list: + for alt_img in alternative_images: + if type(alt_img) == dict: + iiif_id = alt_img.get("iiif_id") + if iiif_id is None: + # The API for alternative images does not include the + # 'id', so we must skip if `iiif_id` is not present. + continue + image_url = SmkDataIngester._get_image_url(iiif_id) + height = alt_img.get("height") + width = alt_img.get("width") + filesize = alt_img.get("image_size") or alt_img.get("size") + + images.append( + { + "id": iiif_id, + "image_url": image_url, + "height": height, + "width": width, + "filesize": filesize, + } + ) + return images + + @staticmethod + def _get_metadata(item: dict) -> dict: + meta_data = {} + if created_date := item.get("created"): + meta_data["created_date"] = created_date + collection = item.get("collection") + if type(collection) == list: + meta_data["collection"] = ",".join(collection) + techniques = item.get("techniques") + if type(techniques) == list: + meta_data["techniques"] = ",".join(techniques) + colors = item.get("colors") + if type(colors) == list: + meta_data["colors"] = ",".join(colors) + return meta_data + + def get_record_data(self, data: dict) -> dict | list[dict] | None: + license_info = get_license_info(license_url=data.get("rights")) + if license_info is None: + return + images = [] + alt_images = self._get_images(data) + for img in alt_images: + images.append( + { + "foreign_identifier": img.get("id"), + "foreign_landing_url": self._get_foreign_landing_url(data), + "image_url": img.get("image_url"), + "license_info": license_info, + "title": self._get_title(data), + "creator": self._get_creator(data), + "height": img.get("height"), + "width": img.get("width"), + "filesize": img.get("filesize"), + "meta_data": self._get_metadata(data), + } + ) + return images def main(): - condition = True - offset = 0 - while condition: - query_params = _get_query_param(offset=offset) - items = _get_batch_items(query_params=query_params) - if type(items) == list: - if len(items) > 0: - _handle_items_data(items) - offset += LIMIT - else: - condition = False - else: - condition = False - image_count = image_store.commit() - logger.info(f"total images collected {image_count}") - - -def _get_query_param(offset=0, default_query_param=None): - if default_query_param is None: - default_query_param = DEFAULT_QUERY_PARAMS - query_params = default_query_param.copy() - query_params.update(offset=offset) - return query_params - - -def _get_batch_items( - endpoint=ENDPOINT, query_params=None, headers=None, retries=RETRIES -): - if headers is None: - headers = HEADERS.copy() - items = None - for retry in range(retries): - response = delay_request.get(endpoint, query_params, headers=headers) - try: - response_json = response.json() - if "items" in response_json.keys(): - items = response_json.get("items") - break - except (AttributeError, JSONDecodeError, ValueError, TypeError) as e: - logger.error(f"errored due to {e}") - return items - - -def _handle_items_data( - items, - landing_page_base=LANDING_PAGE_BASE_URL, -): - image_count = 0 - for item in items: - images = _get_images(item) - if len(images) == 0: - continue - rights = item.get("rights") - license_, version = _get_license_info(rights) - if license_ is None and version is None: - continue - object_id = item.get("object_number") - if object_id is None: - continue - foreign_landing_url = landing_page_base + object_id - production = item.get("production") - creator = _get_creator(production) - titles = item.get("titles") - title = _get_title(titles) - meta_data = _get_metadata(item) - for img in images: - license_info = get_license_info(license_=license_, license_version=version) - image_count = image_store.add_item( - foreign_identifier=img.get("id"), - foreign_landing_url=foreign_landing_url, - image_url=img.get("image_url"), - height=img.get("height"), - width=img.get("width"), - filesize=img.get("filesize"), - license_info=license_info, - creator=creator, - title=title, - meta_data=meta_data, - ) - return image_count - - -def _get_images(item): - images = [] - - # Legacy images do not have an iiif_id; fall back to the ID from the - # collection DB. - iiif_id = item.get("image_iiif_id") - id = iiif_id or item.get("id") - - if id is not None: - if iiif_id is None: - # Legacy images do not have IIIF links. - image_url = item.get("image_native") - else: - image_url = _get_image_url(iiif_id) - - height = item.get("image_height") - width = item.get("image_width") - filesize = item.get("image_size") or item.get("size") - images.append( - { - "id": id, - "image_url": image_url, - "height": height, - "width": width, - "filesize": filesize, - } - ) - - alternative_images = item.get("alternative_images") - if type(alternative_images) == list: - for alt_img in alternative_images: - if type(alt_img) == dict: - iiif_id = alt_img.get("iiif_id") - if iiif_id is None: - # The API for alternative images does not include the - # 'id', so we must skip if `iiif_id` is not present. - continue - image_url = _get_image_url(iiif_id) - height = alt_img.get("height") - width = alt_img.get("width") - filesize = alt_img.get("image_size") or alt_img.get("size") - - images.append( - { - "id": iiif_id, - "image_url": image_url, - "height": height, - "width": width, - "filesize": filesize, - } - ) - return images - - -def _get_image_url(image_iiif_id, image_size=IMAGE_SIZE): - # For high quality IIIF-enabled images, restrict the image size to prevent loading - # very large files. - image_url = image_iiif_id + f"/full/!{image_size},/0/default.jpg" - - return image_url - - -def _get_license_info(rights): - license_, version = None, None - if type(rights) == str: - if "creativecommons" in rights: - license_, version = "cc0", "1.0" - - return license_, version - - -def _get_creator(production): - creator = None - if type(production) == list: - if type(production[0]) == dict: - creator = production[0].get("creator") - return creator - - -def _get_title(titles): - title = None - if type(titles) == list: - if type(titles[0]) == dict: - title = titles[0].get("title") - return title - - -def _get_metadata(item): - meta_data = {} - created_date = item.get("created") - if created_date: - meta_data["created_date"] = created_date - collection = item.get("collection") - if type(collection) == list: - meta_data["collection"] = ",".join(collection) - techniques = item.get("techniques") - if type(techniques) == list: - meta_data["techniques"] = ",".join(techniques) - colors = item.get("colors") - if type(colors) == list: - meta_data["colors"] = ",".join(colors) - return meta_data + logger.info("Begin: SMK provider script") + ingester = SmkDataIngester() + ingester.ingest_records() if __name__ == "__main__": diff --git a/openverse_catalog/dags/providers/provider_workflows.py b/openverse_catalog/dags/providers/provider_workflows.py index 2cf114d9f..9542b514c 100644 --- a/openverse_catalog/dags/providers/provider_workflows.py +++ b/openverse_catalog/dags/providers/provider_workflows.py @@ -11,6 +11,7 @@ from providers.provider_api_scripts.museum_victoria import VictoriaDataIngester from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester from providers.provider_api_scripts.science_museum import ScienceMuseumDataIngester +from providers.provider_api_scripts.smk import SmkDataIngester from providers.provider_api_scripts.stocksnap import StockSnapDataIngester from providers.provider_api_scripts.wikimedia_commons import ( WikimediaCommonsDataIngester, @@ -195,6 +196,7 @@ def __post_init__(self): ), ProviderWorkflow( provider_script="smk", + ingestion_callable=SmkDataIngester, start_date=datetime(2020, 1, 1), ), ProviderWorkflow( diff --git a/tests/dags/providers/provider_api_scripts/resources/smk/item_with_alternative_images.json b/tests/dags/providers/provider_api_scripts/resources/smk/item_with_alternative_images.json new file mode 100644 index 000000000..d2bbc3c2a --- /dev/null +++ b/tests/dags/providers/provider_api_scripts/resources/smk/item_with_alternative_images.json @@ -0,0 +1,188 @@ +{ + "acquisition_date": "1787-01-01T00:00:00Z", + "acquisition_date_precision": "1887-12-31", + "alternative_images": [ + { + "height": 3524, + "iiif_id": "https://iip.smk.dk/iiif/jp2/1c18df916_kksgb7575_001.tif.jp2", + "iiif_info": "https://iip.smk.dk/iiif/jp2/1c18df916_kksgb7575_001.tif.jp2/info.json", + "mime_type": "image/tiff", + "native": "https://iip.smk.dk/iiif/jp2/1c18df916_kksgb7575_001.tif.jp2/full/full/0/native.jpg", + "orientation": "portrait", + "size": 11347216.551724138, + "thumbnail": "https://iip-thumb.smk.dk/iiif/jp2/1c18df916_kksgb7575_001.tif.jp2/full/!1024,/0/default.jpg", + "width": 3110 + }, + { + "cropped": [ + "true" + ], + "height": 3142, + "iiif_id": "https://iip.smk.dk/iiif/jp2/p8418r38n_kksgb7575_-_001.tif.reconstructed.tif.jp2", + "iiif_info": "https://iip.smk.dk/iiif/jp2/p8418r38n_kksgb7575_-_001.tif.reconstructed.tif.jp2/info.json", + "mime_type": "image/tiff", + "native": "https://iip.smk.dk/iiif/jp2/p8418r38n_kksgb7575_-_001.tif.reconstructed.tif.jp2/full/full/0/native.jpg", + "orientation": "portrait", + "size": 7452243.448275862, + "thumbnail": "https://iip-thumb.smk.dk/iiif/jp2/p8418r38n_kksgb7575_-_001.tif.reconstructed.tif.jp2/full/!1024,/0/default.jpg", + "width": 2302 + } + ], + "artist": [ + "Gheyn, Jacques II de" + ], + "brightness": 6.35253, + "collection": [ + "Gammel bestand" + ], + "colors": [ + "#888888", + "#ffe7c9", + "#bed69f", + "#382510", + "#777777" + ], + "colortemp": 3.8521814, + "content_person": [ + "Brahe, Tycho" + ], + "content_person_full": [ + { + "forename": "Tycho", + "full_name": "Tycho Brahe", + "gender": "UNKNOWN", + "name": "Brahe, Tycho", + "surname": "Brahe" + } + ], + "contrast": 4.1983867, + "created": "2020-03-21T08:32:04Z", + "dimensions": [ + { + "notes": "186 x 134 mm", + "part": "bladmaal", + "type": "h\u00f8jde", + "unit": "milimeter", + "value": "187" + }, + { + "notes": "186 x 134 mm", + "part": "bladmaal", + "type": "bredde", + "unit": "milimeter", + "value": "142" + } + ], + "documentation": [ + { + "author": "Jan Piet Filedt Kok", + "notes": "238 II", + "shelfmark": "k2000-158", + "title": "The New Hollstein Dutch and Flemish etchings, engravings and woodcuts 1450-1700, The de Gheyn family, part I-II", + "year_of_publication": "2000" + }, + { + "author": "Poul Grinder-Hansen", + "notes": "omt. og afb. p. 30 fig. 6", + "shelfmark": "C 35560", + "title": "Tycho Brahes verden: Danmark i Europa 1550-1600", + "year_of_publication": "2006" + } + ], + "enrichment_url": "https://enrichment.api.smk.dk/api/enrichment/KKSgb7575", + "entropy": 9.792481, + "frontend_url": "https://open.smk.dk/artwork/image/kksgb7575", + "has_3d_file": false, + "has_image": true, + "has_text": true, + "id": "1170001143_object", + "iiif_manifest": "https://api.smk.dk/api/v1/iiif/manifest/?id=kksgb7575", + "image_cropped": true, + "image_height": 3026, + "image_hq": true, + "image_iiif_id": "https://iip.smk.dk/iiif/jp2/t722hd324_KKSgb7575_crop.tif.jp2", + "image_iiif_info": "https://iip.smk.dk/iiif/jp2/t722hd324_KKSgb7575_crop.tif.jp2/info.json", + "image_mime_type": "image/tiff", + "image_native": "https://api.smk.dk/api/v1/download/W3siaW1nX3VybCI6Imh0dHBzOi8vaWlwLnNtay5kay9paWlmL2pwMi90NzIyaGQzMjRfS0tTZ2I3NTc1X2Nyb3AudGlmLmpwMi9mdWxsL2Z1bGwvMC9uYXRpdmUuanBnIiwicHVibGljX2RvbWFpbiI6dHJ1ZSwib2JqZWN0X251bWJlciI6IktLU2diNzU3NSIsIm51bSI6IiJ9XQ==/KKSgb7575.jpg", + "image_orientation": "portrait", + "image_size": 6706084, + "image_thumbnail": "https://iip-thumb.smk.dk/iiif/jp2/t722hd324_KKSgb7575_crop.tif.jp2/full/!1024,/0/default.jpg", + "image_width": 2183, + "inscriptions": [ + { + "content": "\"Effigies Tychonis Brahe ottonidis dani/dni de knudstrup et arcis uranienburg in/insula hellisponti danici hvenna fundatoris/instrumentorumos astronomicorum in eadem/dispositarim inventoris et structoris/aetatis su\u00e6 anno 40.anno dni 1586.compl/\"", + "description": "Kobberstik", + "language": "Latin", + "type": "Tekst" + }, + { + "content": "Grafikers og udgivers navn anf\u00f8rt i trykpladen", + "type": "Signatur" + } + ], + "modified": "2022-06-30T07:24:58Z", + "number_of_parts": 1, + "object_names": [ + { + "name": "Kobberstik" + } + ], + "object_number": "KKSgb7575", + "object_url": "https://api.smk.dk/api/v1/art/?object_number=kksgb7575", + "on_display": false, + "production": [ + { + "creator": "Gheyn, Jacques II de", + "creator_date_of_birth": "1565-01-01T00:00:00.000Z", + "creator_date_of_death": "1629-01-01T00:00:00.000Z", + "creator_gender": "MALE", + "creator_lref": "22711_person", + "creator_nationality": "Flamsk" + }, + { + "creator": "Sadeler, Marcus", + "creator_date_of_birth": "1614-04-01T00:00:00.000Z", + "creator_date_of_death": "1650-01-01T00:00:00.000Z", + "creator_forename": "Marcus", + "creator_gender": "UNKNOWN", + "creator_lref": "30171_person", + "creator_nationality": "Tysk", + "creator_role": "Udgiver", + "creator_surname": "Sadeler" + } + ], + "production_date": [ + { + "end": "1594-12-31T00:00:00.000Z", + "period": "1593-1594", + "start": "1593-01-01T00:00:00.000Z" + } + ], + "production_dates_notes": [ + "V\u00e6rkdatering: ca. 1595" + ], + "public_domain": true, + "related_objects": [ + { + "reference": "KKS4530", + "title": "Tycho Brahe" + } + ], + "responsible_department": "Den Kongelige Kobberstiksamling", + "rights": "https://creativecommons.org/publicdomain/zero/1.0/", + "saturation": 2.495402, + "similar_images_url": "https://similar.api.smk.dk/similar/?object_number=KKSgb7575", + "suggested_bg_color": [ + "#888888" + ], + "techniques": [ + "Kobberstik" + ], + "titles": [ + { + "language": "dansk", + "title": "Tycho Brahe", + "type": "museumstitel" + } + ] +} diff --git a/tests/dags/providers/provider_api_scripts/resources/smk/items_batch.json b/tests/dags/providers/provider_api_scripts/resources/smk/items_batch.json deleted file mode 100644 index ee49b2286..000000000 --- a/tests/dags/providers/provider_api_scripts/resources/smk/items_batch.json +++ /dev/null @@ -1,106 +0,0 @@ -[ - { - "acquisition_date_precision": "1893-12-30", - "collection": [ - "Gammel bestand" - ], - "created": "2020-03-21T10:18:17Z", - "dimensions": [ - { - "notes": "256 x 204 mm", - "part": "bladmaal", - "type": "hojde", - "unit": "mm", - "value": "64" - }, - { - "notes": "256 x 204 mm", - "part": "bladmaal", - "type": "bredde", - "unit": "mm", - "value": "55" - } - ], - "distinguishing_features": [ - "Indhold / Kollation: Tegning i to dele fra episode mellem den engelske og hollandske fl\u00e5de - fandt sted d. 3. el. 10. august 1665" - ], - "documentation": [ - { - "author": "F.W.H. Hollstein", - "notes": "e.15", - "shelfmark": "22473", - "title": "Hollstein's German engravings, etchings and woodcuts 1450-1700 (vol. III), Hans Sebald Beham" - }, - { - "author": "Robert A. Koch", - "notes": "14", - "shelfmark": "C 36188", - "title": "The Illustrated Bartsch 14, Early German Masters, Albrecht Altdorfer, Monogrammists: Albrecht Altdorfer, Monogrammists" - } - ], - "enrichment_url": "https://enrichment.api.smk.dk/api/enrichment/KKS1615", - "has_image": true, - "id": "1170000264_object", - "iiif_manifest": "https://api.smk.dk/api/v1/iiif/manifest/?id=kks1615", - "image_height": 5141, - "image_iiif_id": "https://iip.smk.dk/iiif/jp2/kks1615.tif.jp2", - "image_iiif_info": "https://iip.smk.dk/iiif/jp2/kks1615.tif.jp2/info.json", - "image_mime_type": "image/tiff", - "image_native": "https://api.smk.dk/api/v1/download/W3siaW1nX3VybCI6Imh0dHBzOi8vaWlwLnNtay5kay9paWlmL2pwMi9ra3MxNjE1LnRpZi5qcDIvZnVsbC9mdWxsLzAvbmF0aXZlLmpwZyIsInB1YmxpY19kb21haW4iOnRydWUsIm9iamVjdF9udW1iZXIiOiJLS1MxNjE1IiwibnVtIjoiIn1d/KKS1615.jpg", - "image_orientation": "portrait", - "image_size": 47466428, - "image_thumbnail": "https://iip-thumb.smk.dk/iiif/jp2/kks1615.tif.jp2/full/!1024,/0/default.jpg", - "image_width": 3076, - "inscriptions": [ - { - "content": "Grafikers navn anf\u00f8rt i trykpladen ", - "language": "dansk" - } - ], - "modified": "2020-03-22T23:31:17Z", - "object_names": [ - { - "name": "kobberstik" - } - ], - "object_number": "KKS1615", - "object_url": "https://api.smk.dk/api/v1/art/?object_number=kks1615", - "on_display": false, - "production": [ - { - "creator": "Altdorfer, Albrecht", - "creator_date_of_birth": "1478-01-01T00:00:00.000Z", - "creator_date_of_death": "1536-01-01T00:00:00.000Z", - "creator_forename": "Albrecht", - "creator_gender": "MALE", - "creator_lref": "1180000107_person", - "creator_nationality": "tysk", - "creator_surname": "Altdorfer" - } - ], - "production_date": [ - { - "end": "1537-11-29T00:00:00.000Z", - "period": "1479-1537", - "start": "1479-11-29T00:00:00.000Z" - } - ], - "production_dates_notes": [ - "verk datering: 1480 - 1538", - "Datering f\u00f8lger Altdorfers leve\u00e5r, da v\u00e6rket er udateret" - ], - "public_domain": true, - "responsible_department": "Samling og Forskning (KKS)", - "rights": "https://creativecommons.org/share-your-work/public-domain/cc0/", - "similar_images_url": "https://similar.api.smk.dk/similar/?object_number=KKS1615", - "techniques": [ - "Kobberstik" - ], - "titles": [ - { - "title": "Jomfru Maria med barnet og Sankt Anne ved vuggen", - "type": "DESCRIPT" - } - ] - } -] diff --git a/tests/dags/providers/provider_api_scripts/resources/smk/response_failure.json b/tests/dags/providers/provider_api_scripts/resources/smk/response_failure.json deleted file mode 100644 index 6439ace30..000000000 --- a/tests/dags/providers/provider_api_scripts/resources/smk/response_failure.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "autocomplete": [], - "facets": {}, - "facets_ranges": {}, - "found": 32732, - "offset": 40000, - "rows": 1 -} diff --git a/tests/dags/providers/provider_api_scripts/resources/smk/response_success.json b/tests/dags/providers/provider_api_scripts/resources/smk/response_success.json deleted file mode 100644 index a65cc6542..000000000 --- a/tests/dags/providers/provider_api_scripts/resources/smk/response_success.json +++ /dev/null @@ -1,114 +0,0 @@ -{ - "autocomplete": [], - "facets": {}, - "facets_ranges": {}, - "found": 32732, - "items": [ - { - "acquisition_date_precision": "1893-12-30", - "collection": [ - "Gammel bestand" - ], - "created": "2020-03-21T10:18:17Z", - "dimensions": [ - { - "notes": "256 x 204 mm", - "part": "bladmaal", - "type": "hojde", - "unit": "mm", - "value": "64" - }, - { - "notes": "256 x 204 mm", - "part": "bladmaal", - "type": "bredde", - "unit": "mm", - "value": "55" - } - ], - "distinguishing_features": [ - "Indhold / Kollation: Tegning i to dele fra episode mellem den engelske og hollandske fl\u00e5de - fandt sted d. 3. el. 10. august 1665" - ], - "documentation": [ - { - "author": "F.W.H. Hollstein", - "notes": "e.15", - "shelfmark": "22473", - "title": "Hollstein's German engravings, etchings and woodcuts 1450-1700 (vol. III), Hans Sebald Beham" - }, - { - "author": "Robert A. Koch", - "notes": "14", - "shelfmark": "C 36188", - "title": "The Illustrated Bartsch 14, Early German Masters, Albrecht Altdorfer, Monogrammists: Albrecht Altdorfer, Monogrammists" - } - ], - "enrichment_url": "https://enrichment.api.smk.dk/api/enrichment/KKS1615", - "has_image": true, - "id": "1170000264_object", - "iiif_manifest": "https://api.smk.dk/api/v1/iiif/manifest/?id=kks1615", - "image_height": 5141, - "image_iiif_id": "https://iip.smk.dk/iiif/jp2/kks1615.tif.jp2", - "image_iiif_info": "https://iip.smk.dk/iiif/jp2/kks1615.tif.jp2/info.json", - "image_mime_type": "image/tiff", - "image_native": "https://api.smk.dk/api/v1/download/W3siaW1nX3VybCI6Imh0dHBzOi8vaWlwLnNtay5kay9paWlmL2pwMi9ra3MxNjE1LnRpZi5qcDIvZnVsbC9mdWxsLzAvbmF0aXZlLmpwZyIsInB1YmxpY19kb21haW4iOnRydWUsIm9iamVjdF9udW1iZXIiOiJLS1MxNjE1IiwibnVtIjoiIn1d/KKS1615.jpg", - "image_orientation": "portrait", - "image_size": 47466428, - "image_thumbnail": "https://iip-thumb.smk.dk/iiif/jp2/kks1615.tif.jp2/full/!1024,/0/default.jpg", - "image_width": 3076, - "inscriptions": [ - { - "content": "Grafikers navn anf\u00f8rt i trykpladen ", - "language": "dansk" - } - ], - "modified": "2020-03-22T23:31:17Z", - "object_names": [ - { - "name": "kobberstik" - } - ], - "object_number": "KKS1615", - "object_url": "https://api.smk.dk/api/v1/art/?object_number=kks1615", - "on_display": false, - "production": [ - { - "creator": "Altdorfer, Albrecht", - "creator_date_of_birth": "1478-01-01T00:00:00.000Z", - "creator_date_of_death": "1536-01-01T00:00:00.000Z", - "creator_forename": "Albrecht", - "creator_gender": "MALE", - "creator_lref": "1180000107_person", - "creator_nationality": "tysk", - "creator_surname": "Altdorfer" - } - ], - "production_date": [ - { - "end": "1537-11-29T00:00:00.000Z", - "period": "1479-1537", - "start": "1479-11-29T00:00:00.000Z" - } - ], - "production_dates_notes": [ - "verk datering: 1480 - 1538", - "Datering f\u00f8lger Altdorfers leve\u00e5r, da v\u00e6rket er udateret" - ], - "public_domain": true, - "responsible_department": "Samling og Forskning (KKS)", - "rights": "https://creativecommons.org/share-your-work/public-domain/cc0/", - "similar_images_url": "https://similar.api.smk.dk/similar/?object_number=KKS1615", - "techniques": [ - "Kobberstik" - ], - "titles": [ - { - "title": "Jomfru Maria med barnet og Sankt Anne ved vuggen", - "type": "DESCRIPT" - } - ] - } - ], - "offset": 0, - "rows": 1 -} diff --git a/tests/dags/providers/provider_api_scripts/test_smk.py b/tests/dags/providers/provider_api_scripts/test_smk.py index a26081796..d1c9a6cb3 100644 --- a/tests/dags/providers/provider_api_scripts/test_smk.py +++ b/tests/dags/providers/provider_api_scripts/test_smk.py @@ -1,18 +1,13 @@ import json -import logging from pathlib import Path -from unittest.mock import MagicMock, patch -import requests from common.licenses import LicenseInfo -from providers.provider_api_scripts import smk +from providers.provider_api_scripts.smk import SmkDataIngester RESOURCES = Path(__file__).parent.resolve() / "resources/smk" -logging.basicConfig( - format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.DEBUG -) +smk = SmkDataIngester() CC0 = LicenseInfo( "cc0", @@ -28,8 +23,8 @@ def _get_resource_json(json_name): return resource_json -def test_get_query_param_default(): - actual_param = smk._get_query_param() +def test_get_next_query_params_first_call(): + actual_param = smk.get_next_query_params(prev_query_params=None) expected_param = { "keys": "*", "filters": "[has_image:true],[public_domain:true]", @@ -40,150 +35,68 @@ def test_get_query_param_default(): assert actual_param == expected_param -def test_get_query_param_offset(): - actual_param = smk._get_query_param(offset=100) +def test_get_next_query_params_increments_offset(): + actual_param = smk.get_next_query_params( + { + "keys": "*", + "filters": "[has_image:true],[public_domain:true]", + "offset": 0, + "rows": 2000, + } + ) expected_param = { "keys": "*", "filters": "[has_image:true],[public_domain:true]", - "offset": 100, + "offset": 2000, "rows": 2000, } assert actual_param == expected_param -def test_get_batch_items_success(): - query_param = { - "keys": "*", - "filters": "[has_image:true],[public_domain:true]", - "offset": 0, - "rows": 1, - } - response = _get_resource_json("response_success.json") - r = requests.Response() - r.status_code = 200 - r.json = MagicMock(return_value=response) - with patch.object(smk.delay_request, "get", return_value=r) as mock_call: - actual_response = smk._get_batch_items(query_params=query_param) +def test__get_foreign_landing_url(): + item = {"object_number": "KKSgb22423"} + actual_url = smk._get_foreign_landing_url(item) + expected_url = "https://open.smk.dk/en/artwork/image/KKSgb22423" + assert actual_url == expected_url - expected_response = response.get("items") - assert mock_call.call_count == 1 - assert actual_response == expected_response +def test__get_image_url(): + image_iiif_id = "https://iip.smk.dk/iiif/jp2/1z40kx99j_kksgb22423.tif.jp2" + actual_url = smk._get_image_url(image_iiif_id) + expected_url = "https://iip.smk.dk/iiif/jp2/1z40kx99j_kksgb22423.tif.jp2/full/!2048,/0/default.jpg" + assert actual_url == expected_url -def test_get_batch_item_failure1(): - query_param = { - "keys": "*", - "filters": "[has_image:true],[public_domain:true]", - "offset": 40000, - "rows": 2000, - } - response = _get_resource_json("response_failure.json") - r = requests.Response() - r.status_code = 200 - r.json = MagicMock(return_value=response) - with patch.object(smk.delay_request, "get", return_value=r) as mock_call: - actual_response = smk._get_batch_items(query_params=query_param) - - assert mock_call.call_count == 3 - assert actual_response is None - - -def test_get_batch_item_failure2(): - query_param = { - "keys": "*", - "filters": "[has_image:true],[public_domain:true]", - "offset": 0, - "rows": 2000, - } - response = None - with patch.object(smk.delay_request, "get", return_value=response) as mock_call: - actual_response = smk._get_batch_items(query_params=query_param) - - assert mock_call.call_count == 3 - assert actual_response is None - - -def test_handle_items_data_success(): - items = _get_resource_json("items_batch.json") - with patch.object(smk.image_store, "add_item", return_value=1) as mock_add_item: - actual_image_count = smk._handle_items_data(items) - - assert mock_add_item.call_count == 1 - assert actual_image_count == 1 - - -def test_handle_items_data_success_data(): - items = _get_resource_json("items_batch.json") - with patch.object(smk.image_store, "save_item") as mock_save_item: - smk._handle_items_data(items) - - args, kwargs = mock_save_item.call_args - expected_image = { - "foreign_identifier": "https://iip.smk.dk/iiif/jp2/kks1615.tif.jp2", - "foreign_landing_url": "https://open.smk.dk/en/artwork/image/KKS1615", - "url": "https://iip.smk.dk/iiif/jp2/kks1615.tif.jp2/full/!2048,/0/default.jpg", - "height": 5141, - "width": 3076, - "filesize": 47466428, - "filetype": "jpg", - "license_version": CC0.version, - "license_": CC0.license, - "creator": "Altdorfer, Albrecht", - "title": "Jomfru Maria med barnet og Sankt Anne ved vuggen", - "meta_data": { - "created_date": "2020-03-21T10:18:17Z", - "collection": "Gammel bestand", - "techniques": "Kobberstik", - "license_url": CC0.url, - "raw_license_url": None, - }, - } - actual_image = args[0] - for key, value in expected_image.items(): - assert getattr(actual_image, key) == expected_image[key] - - -def test_filesize_set_to_none_when_none_given(): - items = _get_resource_json("items_batch.json") - items[0].pop("image_size", None) - with patch.object(smk.image_store, "save_item") as mock_save_item: - smk._handle_items_data(items) - - args, kwargs = mock_save_item.call_args - actual_image = args[0] - assert actual_image.filesize is None - +def test__get_title(): + item = {"titles": [{"title": "sample"}]} + actual_title = smk._get_title(item) + assert actual_title == "sample" -def test_handle_items_data_failure(): - items = [] - with patch.object(smk.image_store, "add_item", return_value=None) as mock_add_item: - actual_image_count = smk._handle_items_data(items) - assert mock_add_item.call_count == 0 - assert actual_image_count == 0 +def test__get_title_none(): + item = {"id": "123_object"} + actual_title = smk._get_title(item) + assert actual_title is None -def test_get_image_high_quality(): +def test__get_images_high_quality(): item = _get_resource_json("image_data_hq.json") expected_images_data = _get_resource_json("expected_image_data_hq.json") - actual_images_data = smk._get_images(item) assert actual_images_data == expected_images_data -def test_get_image_legacy(): +def test__get_images_legacy(): item = _get_resource_json("image_data_legacy.json") expected_images_data = _get_resource_json("expected_image_data_legacy.json") - actual_images_data = smk._get_images(item) assert actual_images_data == expected_images_data -def test_get_image_partial(): +def test__get_images_partial(): item = _get_resource_json("image_data_partial.json") expected_images_data = _get_resource_json("expected_image_data_partial.json") @@ -192,70 +105,7 @@ def test_get_image_partial(): assert actual_images_data == expected_images_data -def test_get_image_none(): - item = {} - expected_images_data = [] - actual_images_data = smk._get_images(item) - - assert actual_images_data == expected_images_data - - -def test_get_image_urls(): - image_iif_id = "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2" - actual_image_url = smk._get_image_url(image_iif_id) - - expected_image_url = ( - "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg" - ) - - assert actual_image_url == expected_image_url - - -def test_get_license_info_success(): - rights = "https://creativecommons.org/share-your-work/public-domain/cc0/" - actual_license_, actual_version = smk._get_license_info(rights) - - assert actual_license_ == "cc0" - assert actual_version == "1.0" - - -def test_get_license_info_failure(): - rights = None - actual_license_, actual_version = smk._get_license_info(rights) - - assert actual_version is None - assert actual_license_ is None - - -def test_get_creator(): - production = [{"creator": "sample"}] - actual_creator = smk._get_creator(production) - - assert actual_creator == "sample" - - -def test_get_creator_none(): - production = {} - actual_creator = smk._get_creator(production) - - assert actual_creator is None - - -def test_get_title(): - titles = [{"title": "sample"}] - actual_title = smk._get_title(titles) - - assert actual_title == "sample" - - -def test_get_title_none(): - titles = None - actual_title = smk._get_title(titles) - - assert actual_title is None - - -def test_get_metadata(): +def test__get_metadata(): item = _get_resource_json("item.json") actual_metadata = smk._get_metadata(item) @@ -264,5 +114,18 @@ def test_get_metadata(): "collection": "Gammel bestand", "techniques": "Kobberstik", } - assert actual_metadata == expected_metadata + + +def test_get_record_data_returns_main_image(): + item = _get_resource_json("item.json") + images = smk.get_record_data(item) + + assert len(images) == 1 + + +def test_get_record_data_returns_alternative_images(): + item = _get_resource_json("item_with_alternative_images.json") + images = smk.get_record_data(item) + + assert len(images) == 3