Skip to content

Commit

Permalink
Reinstate image thumbnail column (#903)
Browse the repository at this point in the history
  • Loading branch information
krysal authored Dec 7, 2022
1 parent 6f92f40 commit f3799fc
Show file tree
Hide file tree
Showing 11 changed files with 44 additions and 10 deletions.
15 changes: 14 additions & 1 deletion DAGs.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ The following are DAGs grouped by their primary tag:
| [`rawpixel_workflow`](#rawpixel_workflow) | `@monthly` | `False` | image |
| [`science_museum_workflow`](#science_museum_workflow) | `@monthly` | `False` | image |
| [`smithsonian_workflow`](#smithsonian_workflow) | `@weekly` | `False` | image |
| `smk_workflow` | `@monthly` | `False` | image |
| [`smk_workflow`](#smk_workflow) | `@monthly` | `False` | image |
| [`stocksnap_workflow`](#stocksnap_workflow) | `@monthly` | `False` | image |
| [`wikimedia_commons_workflow`](#wikimedia_commons_workflow) | `@daily` | `True` | image, audio |
| [`wordpress_workflow`](#wordpress_workflow) | `@monthly` | `False` | image |
Expand Down Expand Up @@ -125,6 +125,7 @@ The following is documentation associated with each DAG (where available):
1. [`report_pending_reported_media`](#report_pending_reported_media)
1. [`science_museum_workflow`](#science_museum_workflow)
1. [`smithsonian_workflow`](#smithsonian_workflow)
1. [`smk_workflow`](#smk_workflow)
1. [`stocksnap_workflow`](#stocksnap_workflow)
1. [`wikimedia_commons_workflow`](#wikimedia_commons_workflow)
1. [`wikimedia_reingestion_workflow`](#wikimedia_reingestion_workflow)
Expand Down Expand Up @@ -587,6 +588,18 @@ Output: TSV file containing the images and the respective meta-data.
Notes: https://api.si.edu/openaccess/api/v1.0/search


## `smk_workflow`


Content Provider: Statens Museum for Kunst (National Gallery of Denmark)

ETL Process: Use the API to identify all openly licensed media.

Output: TSV file containing the media metadata.

Notes: https://www.smk.dk/en/article/smk-api/


## `stocksnap_workflow`


Expand Down
10 changes: 4 additions & 6 deletions openverse_catalog/dags/common/storage/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def add_item(
foreign_landing_url: str,
image_url: str,
license_info: LicenseInfo,
thumbnail_url: str | None = None,
filesize: int | None = None,
filetype: str | None = None,
foreign_identifier: str | None = None,
Expand Down Expand Up @@ -122,7 +123,7 @@ def add_item(
image_data = {
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"thumbnail_url": None,
"thumbnail_url": thumbnail_url,
"filesize": filesize,
"filetype": filetype,
"license_info": license_info,
Expand All @@ -149,10 +150,6 @@ def _get_image(self, **kwargs) -> Image | None:
image_metadata = self.clean_media_metadata(**kwargs)
if image_metadata is None:
return None
# Set the thumbnail to None to make sure no image provider scripts
# write a value, and to make testing easier by not having to provide
# the value.
image_metadata["thumbnail_url"] = None
# Convert the `image_url` key used in ImageStore, TSV and
# provider API scripts into `url` key used in db
image_metadata["url"] = image_metadata.pop("image_url")
Expand All @@ -176,6 +173,7 @@ class MockImageStore(ImageStore):
"""

NULLABLE_FIELDS = [
"thumbnail_url",
"filesize",
"filetype",
"foreign_identifier",
Expand Down Expand Up @@ -206,7 +204,7 @@ def __init__(
self.media_buffer = []

def add_item(self, **kwargs):
image_data = kwargs | {"thumbnail_url": None}
image_data = kwargs
for field in MockImageStore.NULLABLE_FIELDS:
if field not in image_data:
image_data[field] = None
Expand Down
1 change: 1 addition & 0 deletions openverse_catalog/dags/common/tsv_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def _process_row(tsv_row):
image_store.add_item(
foreign_landing_url=row_image.foreign_landing_url,
image_url=row_image.url,
thumbnail_url=row_image.thumbnail_url,
license_info=get_license_info(
license_url=get_license_url(row_meta_data),
license_=row_image.license_,
Expand Down
16 changes: 14 additions & 2 deletions openverse_catalog/dags/providers/provider_api_scripts/smk.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
"""
Content Provider: Statens Museum for Kunst (National Gallery of Denmark)
ETL Process: Use the API to identify all openly licensed media.
Output: TSV file containing the media metadata.
Notes: https://www.smk.dk/en/article/smk-api/
"""
import logging

from common import constants
Expand Down Expand Up @@ -53,8 +62,6 @@ def _get_foreign_landing_url(item) -> str | None:
def _get_image_url(image_iiif_id: str, image_size=2048):
# For high quality IIIF-enabled images, restrict the image size to prevent
# loading very large files.
# TODO: consider just using the full "image_native" when adding the
# "image_thumbnail".
image_url = f"{image_iiif_id}/full/!{image_size},/0/default.jpg"
return image_url

Expand Down Expand Up @@ -91,13 +98,15 @@ def _get_images(item: dict) -> list:
else:
image_url = SmkDataIngester._get_image_url(iiif_id)

thumbnail_url = item.get("image_thumbnail")
height = item.get("image_height")
width = item.get("image_width")
filesize = item.get("image_size") or item.get("size")
images.append(
{
"id": image_id,
"image_url": image_url,
"thumbnail_url": thumbnail_url,
"height": height,
"width": width,
"filesize": filesize,
Expand All @@ -114,6 +123,7 @@ def _get_images(item: dict) -> list:
# 'id', so we must skip if `iiif_id` is not present.
continue
image_url = SmkDataIngester._get_image_url(iiif_id)
thumbnail_url = alt_img.get("thumbnail")
height = alt_img.get("height")
width = alt_img.get("width")
filesize = alt_img.get("image_size") or alt_img.get("size")
Expand All @@ -122,6 +132,7 @@ def _get_images(item: dict) -> list:
{
"id": iiif_id,
"image_url": image_url,
"thumbnail_url": thumbnail_url,
"height": height,
"width": width,
"filesize": filesize,
Expand Down Expand Up @@ -157,6 +168,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None:
"foreign_identifier": img.get("id"),
"foreign_landing_url": self._get_foreign_landing_url(data),
"image_url": img.get("image_url"),
"thumbnail_url": img.get("thumbnail_url"),
"license_info": license_info,
"title": self._get_title(data),
"creator": self._get_creator(data),
Expand Down
1 change: 0 additions & 1 deletion tests/dags/common/storage/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,6 @@ def mock_enrich_tags(tags):
args_dict["license_"] = args_dict.get("license_info").license
args_dict["license_version"] = args_dict.pop("license_info").version
args_dict["url"] = args_dict.pop("image_url")
args_dict["thumbnail_url"] = None

assert actual_image == image.Image(**args_dict)

Expand Down
3 changes: 3 additions & 0 deletions tests/dags/common/storage/test_media.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,7 @@ def test_MediaStore_get_image_gets_source(
license_info=BY_LICENSE_INFO,
foreign_landing_url=TEST_FOREIGN_LANDING_URL,
image_url=TEST_IMAGE_URL,
thumbnail_url=None,
filetype=None,
filesize=None,
foreign_identifier=None,
Expand Down Expand Up @@ -350,6 +351,7 @@ def item_saver(arg):
license_info=BY_LICENSE_INFO,
foreign_landing_url="",
image_url="",
thumbnail_url=None,
foreign_identifier=None,
width=None,
height=None,
Expand Down Expand Up @@ -385,6 +387,7 @@ def item_saver(arg):
license_info=LicenseInfo("by", "4.0", valid_license_url, license_url),
foreign_landing_url="",
image_url="",
thumbnail_url=None,
foreign_identifier=None,
width=None,
height=None,
Expand Down
2 changes: 2 additions & 0 deletions tests/dags/common/test_tsv_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def test_clean_tsv_cleans_tsv_rows(tmpdir):
call().add_item(
foreign_landing_url="https://example.com/landing1",
image_url="https://example.com/image1",
thumbnail_url="https://example.com/thumbnail1",
license_info=by_license,
foreign_identifier="one",
width="1000",
Expand All @@ -54,6 +55,7 @@ def test_clean_tsv_cleans_tsv_rows(tmpdir):
call().add_item(
foreign_landing_url="https://example.com/landing2",
image_url="https://example.com/image2",
thumbnail_url="https://example.com/thumbnail2",
license_info=by_nc_license,
foreign_identifier="two",
width="1000",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
"height": 1059,
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2",
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2/full/!2048,/0/default.jpg",
"thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/2227ms627_KKSgb6458.tif.reconstructed.tif.jp2/full/!1024,/0/default.jpg",
"width": 3887
},
{
"filesize": 19269857,
"height": 1576,
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg",
"thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg",
"width": 4073
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
"height": 1059,
"id": "1170012466_object",
"image_url": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg",
"thumbnail_url": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg",
"width": 3887
},
{
"filesize": 19269857,
"height": 1576,
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg",
"thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg",
"width": 4073
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"height": 1576,
"id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2",
"image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg",
"thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg",
"width": 4073
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@
"image_iiif_id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2",
"image_iiif_info": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2/info.json",
"image_size": 11784886,
"image_thumbnail": "https://iip-thumb.smk.dk/iiif/jp2/2227ms627_KKSgb6458.tif.reconstructed.tif.jp2/full/!1024,/0/default.jpg",
"image_width": 3887
}

0 comments on commit f3799fc

Please sign in to comment.