Skip to content

Commit

Permalink
Ensure SMK images don't time out on validation (#506)
Browse files Browse the repository at this point in the history
* Rename Staten Museum to SMK

* Request smaller images

* Add support for legacy (non IIIF-enabled) images

* Update and add new tests

* Update provider name in DB
  • Loading branch information
stacimc authored May 20, 2022
1 parent 7dbff46 commit bba0413
Show file tree
Hide file tree
Showing 16 changed files with 121 additions and 57 deletions.
2 changes: 1 addition & 1 deletion openverse_catalog/dags/common/loader/provider_details.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
NYPL_DEFAULT_PROVIDER = "nypl"
RAWPIXEL_DEFAULT_PROVIDER = "rawpixel"
SCIENCE_DEFAULT_PROVIDER = "sciencemuseum"
STATENS_DEFAULT_PROVIDER = "statensmuseum"
SMK_DEFAULT_PROVIDER = "smk"
WALTERS_DEFAULT_PROVIDER = "waltersartmuseum"
FINNISH_DEFAULT_PROVIDER = "finnishmuseums"
JAMENDO_DEFAULT_PROVIDER = "jamendo"
Expand Down
2 changes: 1 addition & 1 deletion openverse_catalog/dags/common/loader/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
prov.NYPL_DEFAULT_PROVIDER: "1 month 3 days",
prov.RAWPIXEL_DEFAULT_PROVIDER: "1 month 3 days",
prov.SCIENCE_DEFAULT_PROVIDER: "1 month 3 days",
prov.STATENS_DEFAULT_PROVIDER: "1 month 3 days",
prov.SMK_DEFAULT_PROVIDER: "1 month 3 days",
}

DB_COLUMNS = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@
LIMIT = 2000
DELAY = 5
RETRIES = 3
PROVIDER = prov.STATENS_DEFAULT_PROVIDER
PROVIDER = prov.SMK_DEFAULT_PROVIDER
ENDPOINT = "https://api.smk.dk/api/v1/art/search/"
LANDING_PAGE_BASE_URL = "https://open.smk.dk/en/artwork/image/"
IMAGE_SIZE = "max"
IMAGE_SIZE = 2048
THUMBNAIL_SIZE = 400

delay_request = DelayedRequester(delay=DELAY)
Expand Down Expand Up @@ -102,7 +102,7 @@ def _handle_items_data(
for img in images:
license_info = get_license_info(license_=license_, license_version=version)
image_count = image_store.add_item(
foreign_identifier=img.get("iiif_id"),
foreign_identifier=img.get("id"),
foreign_landing_url=foreign_landing_url,
image_url=img.get("image_url"),
height=img.get("height"),
Expand All @@ -118,14 +118,26 @@ def _handle_items_data(

def _get_images(item):
images = []
if item.get("image_iiif_id") is not None:
iiif_id = item.get("image_iiif_id")
image_url, thumbnail_url = _get_image_url(iiif_id)

# Legacy images do not have an iiif_id; fall back to the ID from the
# collection DB.
iiif_id = item.get("image_iiif_id")
id = iiif_id or item.get("id")

if id is not None:
if iiif_id is None:
# Legacy images do not have IIIF links.
image_url = item.get("image_native")
thumbnail_url = item.get("image_thumbnail")
else:
image_url, thumbnail_url = _get_image_urls(iiif_id)

height = item.get("image_height")
width = item.get("image_width")

images.append(
{
"iiif_id": iiif_id,
"id": id,
"image_url": image_url,
"thumbnail": thumbnail_url,
"height": height,
Expand All @@ -139,13 +151,15 @@ def _get_images(item):
if type(alt_img) == dict:
iiif_id = alt_img.get("iiif_id")
if iiif_id is None:
# The API for alternative images does not include the
# 'id', so we must skip if `iiif_id` is not present.
continue
image_url, thumbnail_url = _get_image_url(iiif_id)
image_url, thumbnail_url = _get_image_urls(iiif_id)
height = alt_img.get("height")
width = alt_img.get("width")
images.append(
{
"iiif_id": iiif_id,
"id": iiif_id,
"image_url": image_url,
"thumbnail": thumbnail_url,
"height": height,
Expand All @@ -155,8 +169,12 @@ def _get_images(item):
return images


def _get_image_url(image_iiif_id, image_size=IMAGE_SIZE, thumbnail_size=THUMBNAIL_SIZE):
image_url = image_iiif_id + f"/full/{image_size}/0/default.jpg"
def _get_image_urls(
    image_iiif_id, image_size=IMAGE_SIZE, thumbnail_size=THUMBNAIL_SIZE
):
    """
    Build the full-size image URL and the thumbnail URL for an IIIF image.

    Both URLs are the IIIF identifier with a `/full/!<size>,/0/default.jpg`
    suffix appended; only the size value differs between them.

    Required Arguments:

    image_iiif_id:   IIIF identifier (base URL) of the image; it is joined
                     to the suffix with `+`, so it must be a string.
    image_size:      size value placed in the main image URL.
    thumbnail_size:  size value placed in the thumbnail URL.

    Returns a tuple `(image_url, thumbnail_url)`.
    """
    # The main image is requested at a bounded size rather than at full
    # resolution, to prevent loading very large files for high quality
    # IIIF-enabled images.
    # NOTE(review): presumably the `!<size>,` segment caps the image
    # dimensions on the SMK image server -- confirm against its IIIF docs.
    image_url, thumbnail_url = (
        image_iiif_id + f"/full/!{size},/0/default.jpg"
        for size in (image_size, thumbnail_size)
    )
    return image_url, thumbnail_url
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
"""
This file configures the Apache Airflow DAG to ingest Statens museum data.
This file configures the Apache Airflow DAG to ingest data for SMK, the
National Gallery of Denmark.
We do this by running `provider_api_scripts.staten_museum.main`
We do this by running `provider_api_scripts.smk.main`
"""
import logging

# airflow DAG (necessary for Airflow to find this file)
from datetime import datetime, timedelta

from common.provider_dag_factory import create_provider_api_workflow
from providers.provider_api_scripts import staten_museum
from providers.provider_api_scripts import smk


logging.basicConfig(
Expand All @@ -18,12 +19,12 @@

logger = logging.getLogger(__name__)

DAG_ID = "staten_museum_workflow"
DAG_ID = "smk_workflow"
START_DATE = datetime(2020, 1, 1)

globals()[DAG_ID] = create_provider_api_workflow(
DAG_ID,
staten_museum.main,
smk.main,
start_date=START_DATE,
schedule_string="@monthly",
dated=False,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
[
{
"height": 1059,
"iiif_id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2",
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2/full/max/0/default.jpg",
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2",
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2/full/!2048,/0/default.jpg",
"thumbnail": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2/full/!400,/0/default.jpg",
"width": 3887
},
{
"height": 1576,
"iiif_id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/max/0/default.jpg",
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg",
"thumbnail": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!400,/0/default.jpg",
"width": 4073
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[
{
"height": 1059,
"id": "1170012466_object",
"image_url": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg",
"thumbnail": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg",
"width": 3887
},
{
"height": 1576,
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg",
"thumbnail": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!400,/0/default.jpg",
"width": 4073
}
]
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[
{
"height": 1576,
"iiif_id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/max/0/default.jpg",
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg",
"thumbnail": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!400,/0/default.jpg",
"width": 4073
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"width": 4073
}
],
"id": "1170012466_object",
"image_height": 1059,
"image_iiif_id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2",
"image_iiif_info": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2/info.json",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"alternative_images": [
{
"height": 1576,
"iiif_id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
"iiif_info": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/info.json",
"mime_type": "image/tiff",
"native": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/full/0/native.jpg",
"orientation": "landscape",
"size": 19269857,
"thumbnail": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg",
"width": 4073
}
],
"id": "1170012466_object",
"image_height": 1059,
"image_native": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg",
"image_size": 11784886,
"image_thumbnail": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg",
"image_width": 3887
}
Loading

0 comments on commit bba0413

Please sign in to comment.