Skip to content

Commit

Permalink
Remove alternate image extraction from SMK, fix foreign landing URL (#…
Browse files Browse the repository at this point in the history
…1003)

* Remove alternative images logic from SMK

* Correctly encode foreign landing urls

* Set the watermarked value to True temporarily for SMK
  • Loading branch information
AetherUnbound authored Mar 15, 2023
1 parent b173d6e commit 53b3da1
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 233 deletions.
39 changes: 13 additions & 26 deletions openverse_catalog/dags/providers/provider_api_scripts/smk.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
Notes: https://www.smk.dk/en/article/smk-api/
"""
import logging
import urllib.parse

from common import constants
from common.licenses import get_license_info
Expand Down Expand Up @@ -56,7 +57,9 @@ def _get_foreign_landing_url(item) -> str | None:
"foreign_landing_url."
)
return
return f"https://open.smk.dk/en/artwork/image/{object_num}"
# Occasionally, the object number will have a space in it. for these cases we
# need to urlencode it.
return f"https://open.smk.dk/en/artwork/image/{urllib.parse.quote(object_num)}"

@staticmethod
def _get_image_url(image_iiif_id: str, image_size=2048):
Expand Down Expand Up @@ -113,31 +116,11 @@ def _get_images(item: dict) -> list:
}
)

alternative_images = item.get("alternative_images")
if type(alternative_images) == list:
for alt_img in alternative_images:
if type(alt_img) == dict:
iiif_id = alt_img.get("iiif_id")
if iiif_id is None:
# The API for alternative images does not include the
# 'id', so we must skip if `iiif_id` is not present.
continue
image_url = SmkDataIngester._get_image_url(iiif_id)
thumbnail_url = alt_img.get("thumbnail")
height = alt_img.get("height")
width = alt_img.get("width")
filesize = alt_img.get("image_size") or alt_img.get("size")

images.append(
{
"id": iiif_id,
"image_url": image_url,
"thumbnail_url": thumbnail_url,
"height": height,
"width": width,
"filesize": filesize,
}
)
# We used to get additional images from the `alternative_images` field,
# but we found these to be either redundant, duplicates, or lower quality
# versions. See https://github.com/WordPress/openverse-catalog/issues/875
# for the full investigation & discussion

return images

@staticmethod
Expand Down Expand Up @@ -176,6 +159,10 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None:
"width": img.get("width"),
"filesize": img.get("filesize"),
"meta_data": self._get_metadata(data),
# FIXME: USED AS A SENTINEL FOR WHICH FILES TO DELETE AFTER A FULL
# RUN. THIS SHOULD BE REMOVED AS SOON AS A RUN IS COMPLETE AND THE
# FILES MARKED AS WATERMARKED=False ARE DELETED.
"watermarked": True,
}
)
return images
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,5 @@
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2/full/!2048,/0/default.jpg",
"thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/2227ms627_KKSgb6458.tif.reconstructed.tif.jp2/full/!1024,/0/default.jpg",
"width": 3887
},
{
"filesize": 19269857,
"height": 1576,
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg",
"thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg",
"width": 4073
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,5 @@
"image_url": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg",
"thumbnail_url": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg",
"width": 3887
},
{
"filesize": 19269857,
"height": 1576,
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg",
"thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg",
"width": 4073
}
]

This file was deleted.

This file was deleted.

30 changes: 17 additions & 13 deletions tests/dags/providers/provider_api_scripts/test_smk.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
from pathlib import Path

import pytest
from common.licenses import LicenseInfo
from providers.provider_api_scripts.smk import SmkDataIngester

Expand Down Expand Up @@ -57,10 +58,20 @@ def test_get_next_query_params_increments_offset():
assert actual_param == expected_param


def test__get_foreign_landing_url():
item = {"object_number": "KKSgb22423"}
@pytest.mark.parametrize(
"object_number, expected_url",
[
("KKSgb22423", "https://open.smk.dk/en/artwork/image/KKSgb22423"),
(
"KKSgb22423 version 1",
"https://open.smk.dk/en/artwork/image/KKSgb22423%20version%201",
),
("KSMB 25 106.5", "https://open.smk.dk/en/artwork/image/KSMB%2025%20106.5"),
],
)
def test__get_foreign_landing_url(object_number, expected_url):
item = {"object_number": object_number}
actual_url = smk._get_foreign_landing_url(item)
expected_url = "https://open.smk.dk/en/artwork/image/KKSgb22423"
assert actual_url == expected_url


Expand Down Expand Up @@ -100,12 +111,12 @@ def test__get_images_legacy():


def test__get_images_partial():
# Left in as a regression test for
# https://github.com/WordPress/openverse-catalog/issues/875
item = _get_resource_json("image_data_partial.json")
expected_images_data = _get_resource_json("expected_image_data_partial.json")

actual_images_data = smk._get_images(item)

assert actual_images_data == expected_images_data
assert actual_images_data == []


def test__get_metadata():
Expand All @@ -125,10 +136,3 @@ def test_get_record_data_returns_main_image():
images = smk.get_record_data(item)

assert len(images) == 1


def test_get_record_data_returns_alternative_images():
item = _get_resource_json("item_with_alternative_images.json")
images = smk.get_record_data(item)

assert len(images) == 3

0 comments on commit 53b3da1

Please sign in to comment.