From f3799fc02c189848082decae60afaca7d9fefa2c Mon Sep 17 00:00:00 2001 From: Krystle Salazar Date: Wed, 7 Dec 2022 17:30:04 -0400 Subject: [PATCH] Reinstate image thumbnail column (#903) --- DAGs.md | 15 ++++++++++++++- openverse_catalog/dags/common/storage/image.py | 10 ++++------ openverse_catalog/dags/common/tsv_cleaner.py | 1 + .../dags/providers/provider_api_scripts/smk.py | 16 ++++++++++++++-- tests/dags/common/storage/test_image.py | 1 - tests/dags/common/storage/test_media.py | 3 +++ tests/dags/common/test_tsv_cleaner.py | 2 ++ .../resources/smk/expected_image_data_hq.json | 2 ++ .../smk/expected_image_data_legacy.json | 2 ++ .../smk/expected_image_data_partial.json | 1 + .../resources/smk/image_data_hq.json | 1 + 11 files changed, 44 insertions(+), 10 deletions(-) diff --git a/DAGs.md b/DAGs.md index 3e0a57d78df..f7887c2e5ca 100644 --- a/DAGs.md +++ b/DAGs.md @@ -79,7 +79,7 @@ The following are DAGs grouped by their primary tag: | [`rawpixel_workflow`](#rawpixel_workflow) | `@monthly` | `False` | image | | [`science_museum_workflow`](#science_museum_workflow) | `@monthly` | `False` | image | | [`smithsonian_workflow`](#smithsonian_workflow) | `@weekly` | `False` | image | -| `smk_workflow` | `@monthly` | `False` | image | +| [`smk_workflow`](#smk_workflow) | `@monthly` | `False` | image | | [`stocksnap_workflow`](#stocksnap_workflow) | `@monthly` | `False` | image | | [`wikimedia_commons_workflow`](#wikimedia_commons_workflow) | `@daily` | `True` | image, audio | | [`wordpress_workflow`](#wordpress_workflow) | `@monthly` | `False` | image | @@ -125,6 +125,7 @@ The following is documentation associated with each DAG (where available): 1. [`report_pending_reported_media`](#report_pending_reported_media) 1. [`science_museum_workflow`](#science_museum_workflow) 1. [`smithsonian_workflow`](#smithsonian_workflow) + 1. [`smk_workflow`](#smk_workflow) 1. [`stocksnap_workflow`](#stocksnap_workflow) 1. [`wikimedia_commons_workflow`](#wikimedia_commons_workflow) 1. [`wikimedia_reingestion_workflow`](#wikimedia_reingestion_workflow) @@ -587,6 +588,18 @@ Output: TSV file containing the images and the respective meta-data. Notes: https://api.si.edu/openaccess/api/v1.0/search +## `smk_workflow` + + +Content Provider: Statens Museum for Kunst (National Gallery of Denmark) + +ETL Process: Use the API to identify all openly licensed media. + +Output: TSV file containing the media metadata. + +Notes: https://www.smk.dk/en/article/smk-api/ + + ## `stocksnap_workflow` diff --git a/openverse_catalog/dags/common/storage/image.py b/openverse_catalog/dags/common/storage/image.py index dc843fc3f70..9a3ceb7ce9f 100644 --- a/openverse_catalog/dags/common/storage/image.py +++ b/openverse_catalog/dags/common/storage/image.py @@ -44,6 +44,7 @@ def add_item( foreign_landing_url: str, image_url: str, license_info: LicenseInfo, + thumbnail_url: str | None = None, filesize: int | None = None, filetype: str | None = None, foreign_identifier: str | None = None, @@ -122,7 +123,7 @@ def add_item( image_data = { "foreign_landing_url": foreign_landing_url, "image_url": image_url, - "thumbnail_url": None, + "thumbnail_url": thumbnail_url, "filesize": filesize, "filetype": filetype, "license_info": license_info, @@ -149,10 +150,6 @@ def _get_image(self, **kwargs) -> Image | None: image_metadata = self.clean_media_metadata(**kwargs) if image_metadata is None: return None - # Set the thumbnail to None to make sure no image provider scripts - # write a value, and to make testing easier by not having to provide - # the value. - image_metadata["thumbnail_url"] = None # Convert the `image_url` key used in ImageStore, TSV and # provider API scripts into `url` key used in db image_metadata["url"] = image_metadata.pop("image_url") @@ -176,6 +173,7 @@ class MockImageStore(ImageStore): """ NULLABLE_FIELDS = [ + "thumbnail_url", "filesize", "filetype", "foreign_identifier", @@ -206,7 +204,7 @@ def __init__( self.media_buffer = [] def add_item(self, **kwargs): - image_data = kwargs | {"thumbnail_url": None} + image_data = kwargs for field in MockImageStore.NULLABLE_FIELDS: if field not in image_data: image_data[field] = None diff --git a/openverse_catalog/dags/common/tsv_cleaner.py b/openverse_catalog/dags/common/tsv_cleaner.py index 134678c6191..03422fc3013 100644 --- a/openverse_catalog/dags/common/tsv_cleaner.py +++ b/openverse_catalog/dags/common/tsv_cleaner.py @@ -42,6 +42,7 @@ def _process_row(tsv_row): image_store.add_item( foreign_landing_url=row_image.foreign_landing_url, image_url=row_image.url, + thumbnail_url=row_image.thumbnail_url, license_info=get_license_info( license_url=get_license_url(row_meta_data), license_=row_image.license_, diff --git a/openverse_catalog/dags/providers/provider_api_scripts/smk.py b/openverse_catalog/dags/providers/provider_api_scripts/smk.py index 9f992d321d6..9e18635d678 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/smk.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/smk.py @@ -1,3 +1,12 @@ +""" +Content Provider: Statens Museum for Kunst (National Gallery of Denmark) + +ETL Process: Use the API to identify all openly licensed media. + +Output: TSV file containing the media metadata. + +Notes: https://www.smk.dk/en/article/smk-api/ +""" import logging from common import constants @@ -53,8 +62,6 @@ def _get_foreign_landing_url(item) -> str | None: def _get_image_url(image_iiif_id: str, image_size=2048): # For high quality IIIF-enabled images, restrict the image size to prevent # loading very large files. - # TODO: consider just using the full "image_native" when adding the - # "image_thumbnail". image_url = f"{image_iiif_id}/full/!{image_size},/0/default.jpg" return image_url @@ -91,6 +98,7 @@ def _get_images(item: dict) -> list: else: image_url = SmkDataIngester._get_image_url(iiif_id) + thumbnail_url = item.get("image_thumbnail") height = item.get("image_height") width = item.get("image_width") filesize = item.get("image_size") or item.get("size") @@ -98,6 +106,7 @@ def _get_images(item: dict) -> list: { "id": image_id, "image_url": image_url, + "thumbnail_url": thumbnail_url, "height": height, "width": width, "filesize": filesize, @@ -114,6 +123,7 @@ def _get_images(item: dict) -> list: # 'id', so we must skip if `iiif_id` is not present. continue image_url = SmkDataIngester._get_image_url(iiif_id) + thumbnail_url = alt_img.get("thumbnail") height = alt_img.get("height") width = alt_img.get("width") filesize = alt_img.get("image_size") or alt_img.get("size") @@ -122,6 +132,7 @@ def _get_images(item: dict) -> list: { "id": iiif_id, "image_url": image_url, + "thumbnail_url": thumbnail_url, "height": height, "width": width, "filesize": filesize, @@ -157,6 +168,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None: "foreign_identifier": img.get("id"), "foreign_landing_url": self._get_foreign_landing_url(data), "image_url": img.get("image_url"), + "thumbnail_url": img.get("thumbnail_url"), "license_info": license_info, "title": self._get_title(data), "creator": self._get_creator(data), diff --git a/tests/dags/common/storage/test_image.py b/tests/dags/common/storage/test_image.py index 7f8af00065f..59a40811234 100644 --- a/tests/dags/common/storage/test_image.py +++ b/tests/dags/common/storage/test_image.py @@ -110,7 +110,6 @@ def mock_enrich_tags(tags): args_dict["license_"] = args_dict.get("license_info").license args_dict["license_version"] = args_dict.pop("license_info").version args_dict["url"] = args_dict.pop("image_url") - args_dict["thumbnail_url"] = None assert actual_image == image.Image(**args_dict) diff --git a/tests/dags/common/storage/test_media.py b/tests/dags/common/storage/test_media.py index d2b7ce5128d..264b4ed29a2 100644 --- a/tests/dags/common/storage/test_media.py +++ b/tests/dags/common/storage/test_media.py @@ -293,6 +293,7 @@ def test_MediaStore_get_image_gets_source( license_info=BY_LICENSE_INFO, foreign_landing_url=TEST_FOREIGN_LANDING_URL, image_url=TEST_IMAGE_URL, + thumbnail_url=None, filetype=None, filesize=None, foreign_identifier=None, @@ -350,6 +351,7 @@ def item_saver(arg): license_info=BY_LICENSE_INFO, foreign_landing_url="", image_url="", + thumbnail_url=None, foreign_identifier=None, width=None, height=None, @@ -385,6 +387,7 @@ def item_saver(arg): license_info=LicenseInfo("by", "4.0", valid_license_url, license_url), foreign_landing_url="", image_url="", + thumbnail_url=None, foreign_identifier=None, width=None, height=None, diff --git a/tests/dags/common/test_tsv_cleaner.py b/tests/dags/common/test_tsv_cleaner.py index 21b046383af..c7c641eceb0 100644 --- a/tests/dags/common/test_tsv_cleaner.py +++ b/tests/dags/common/test_tsv_cleaner.py @@ -30,6 +30,7 @@ def test_clean_tsv_cleans_tsv_rows(tmpdir): call().add_item( foreign_landing_url="https://example.com/landing1", image_url="https://example.com/image1", + thumbnail_url="https://example.com/thumbnail1", license_info=by_license, foreign_identifier="one", width="1000", @@ -54,6 +55,7 @@ def test_clean_tsv_cleans_tsv_rows(tmpdir): call().add_item( foreign_landing_url="https://example.com/landing2", image_url="https://example.com/image2", + thumbnail_url="https://example.com/thumbnail2", license_info=by_nc_license, foreign_identifier="two", width="1000", diff --git a/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_hq.json b/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_hq.json index 83f4e81b186..d595e1ba93e 100644 --- a/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_hq.json +++ b/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_hq.json @@ -4,6 +4,7 @@ "height": 1059, "id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2", "image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2/full/!2048,/0/default.jpg", + "thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/2227ms627_KKSgb6458.tif.reconstructed.tif.jp2/full/!1024,/0/default.jpg", "width": 3887 }, { @@ -11,6 +12,7 @@ "height": 1576, "id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2", "image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg", + "thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg", "width": 4073 } ] diff --git a/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_legacy.json b/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_legacy.json index 1ab38799855..47f2c9c3955 100644 --- a/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_legacy.json +++ b/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_legacy.json @@ -4,6 +4,7 @@ "height": 1059, "id": "1170012466_object", "image_url": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg", + "thumbnail_url": "https://api.smk.dk/api/v1/thumbnail/52f00edc-936e-42a7-950b-d0cd0df3864b.jpg", "width": 3887 }, { @@ -11,6 +12,7 @@ "height": 1576, "id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2", "image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg", + "thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg", "width": 4073 } ] diff --git a/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_partial.json b/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_partial.json index e54d62c691f..9d350643497 100644 --- a/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_partial.json +++ b/tests/dags/providers/provider_api_scripts/resources/smk/expected_image_data_partial.json @@ -4,6 +4,7 @@ "height": 1576, "id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2", "image_url": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!2048,/0/default.jpg", + "thumbnail_url": "https://iip-thumb.smk.dk/iiif/jp2/KKSgb6458.tif.jp2/full/!1024,/0/default.jpg", "width": 4073 } ] diff --git a/tests/dags/providers/provider_api_scripts/resources/smk/image_data_hq.json b/tests/dags/providers/provider_api_scripts/resources/smk/image_data_hq.json index 7a9dc49a9e5..a800d5c5515 100644 --- a/tests/dags/providers/provider_api_scripts/resources/smk/image_data_hq.json +++ b/tests/dags/providers/provider_api_scripts/resources/smk/image_data_hq.json @@ -17,5 +17,6 @@ "image_iiif_id": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2", "image_iiif_info": "https://iip.smk.dk/iiif/jp2/KKSgb6458.tif.reconstructed.tif.jp2/info.json", "image_size": 11784886, + "image_thumbnail": "https://iip-thumb.smk.dk/iiif/jp2/2227ms627_KKSgb6458.tif.reconstructed.tif.jp2/full/!1024,/0/default.jpg", "image_width": 3887 }