From c7e1903f8823d4ff40578e0c30c49421ff1ad291 Mon Sep 17 00:00:00 2001 From: rwidom Date: Tue, 16 Aug 2022 13:58:46 -0400 Subject: [PATCH 01/28] very first draft refactor with additional tags --- .../dags/common/loader/provider_details.py | 1 + .../metropolitan_museum.py | 241 +++++++++-------- .../dags/providers/provider_workflows.py | 6 +- .../sample_additional_image_data.json | 99 +++++-- .../sample_image_data.json | 41 +-- .../sample_response_1027_not_cc0.json | 1 + .../test_metropolitan_museum.py | 248 +++++++----------- 7 files changed, 341 insertions(+), 296 deletions(-) create mode 100644 tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_response_1027_not_cc0.json diff --git a/openverse_catalog/dags/common/loader/provider_details.py b/openverse_catalog/dags/common/loader/provider_details.py index aa5dcdb61..ba0d5b213 100644 --- a/openverse_catalog/dags/common/loader/provider_details.py +++ b/openverse_catalog/dags/common/loader/provider_details.py @@ -20,6 +20,7 @@ SMITHSONIAN_DEFAULT_PROVIDER = "smithsonian" BROOKLYN_DEFAULT_PROVIDER = "brooklynmuseum" CLEVELAND_DEFAULT_PROVIDER = "clevelandmuseum" +MET_MUSEUM_DEFAULT_PROVIDER = "met" VICTORIA_DEFAULT_PROVIDER = "museumsvictoria" NYPL_DEFAULT_PROVIDER = "nypl" RAWPIXEL_DEFAULT_PROVIDER = "rawpixel" diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index bff094b7d..f46829d08 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -8,136 +8,147 @@ Notes: https://metmuseum.github.io/ No rate limit specified. + https://metmuseum.github.io/#search + Please limit requests to 80 requests per second, which we + certainly do, but maybe there are additional constraints we + should be aware of. + + There is also a csv file on github, which might be useful. + https://github.com/metmuseum/openaccess """ import argparse import logging from common.licenses import get_license_info -from common.requester import DelayedRequester -from common.storage.image import ImageStore - +from common.loader import provider_details as prov +from provider_data_ingester import ProviderDataIngester -DELAY = 1.0 # time delay (in seconds) -PROVIDER = "met" -ENDPOINT = "https://collectionapi.metmuseum.org/public/collection/v1/objects" -DEFAULT_LICENSE_INFO = get_license_info(license_="cc0", license_version="1.0") logging.basicConfig( format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.INFO ) logger = logging.getLogger(__name__) -delayed_requester = DelayedRequester(DELAY) -image_store = ImageStore(provider=PROVIDER) - - -def main(date=None): - """ - This script pulls the data for a given date from the Metropolitan - Museum of Art API, and writes it into a .TSV file to be eventually - read into our DB. - - Required Arguments: - - date: Date String in the form YYYY-MM-DD. This is the date for - which running the script will pull data. - """ - - logger.info(f"Begin: Met Museum API requests for date: {date}") - - fetch_the_object_id = _get_object_ids(date) - if fetch_the_object_id: - logger.info(f"Total object found {fetch_the_object_id[0]}") - _extract_the_data(fetch_the_object_id[1]) - - total_images = image_store.commit() - logger.info(f"Total CC0 images received {total_images}") - - -def _get_object_ids(date, endpoint=ENDPOINT): - query_params = "" - if date: - query_params = {"metadataDate": date} - - response = _get_response_json(query_params, endpoint) - - if response: - total_object_ids = response["total"] - object_ids = response["objectIDs"] - else: - logger.warning("No content available") - return None - return [total_object_ids, object_ids] - - -def _get_response_json( - query_params, - endpoint, - retries=5, -): - response_json = delayed_requester.get_response_json( - endpoint, query_params=query_params, retries=retries - ) - - return response_json - - -def _extract_the_data(object_ids): - for i in object_ids: - _get_data_for_image(i) - - -def _get_data_for_image(object_id): - object_json = _get_and_validate_object_json(object_id) - if not object_json: - logger.warning(f"Could not retrieve object_json for object_id: {object_id}") - return - - main_image = object_json.get("primaryImage") - other_images = object_json.get("additionalImages", []) - image_list = [main_image] + other_images - - meta_data = _create_meta_data(object_json) - - for img in image_list: - foreign_id = _build_foreign_id(object_id, img) - image_store.add_item( - foreign_landing_url=object_json.get("objectURL"), - image_url=img, - license_info=DEFAULT_LICENSE_INFO, - foreign_identifier=foreign_id, - creator=object_json.get("artistDisplayName"), - title=object_json.get("title"), - meta_data=meta_data, +# get a list of object IDs for this week: +# https://collectionapi.metmuseum.org/public/collection/v1/objects?metadataDate=2022-08-10 +# get a specific object: +# https://collectionapi.metmuseum.org/public/collection/v1/objects/1027 +# I wish that we could filter on license in the first call, but seems like we can't. +# The search functionality requires a specific query (term search) in addition to date +# and public domain. It seems like it won't connect with just date and license. +# https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 +# https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 +# https://collectionapi.metmuseum.org/public/collection/v1/search?departmentId=1&isPublicDomain=true&metadataDate=2021-08-01 + + +class MetMuseumDataIngester(ProviderDataIngester): + providers = {"image": prov.MET_MUSEUM_DEFAULT_PROVIDER} + endpoint = "https://collectionapi.metmuseum.org/public/collection/v1/objects" + DEFAULT_LICENSE_INFO = get_license_info(license_="cc0", license_version="1.0") + + + def __init__(self, date: str = None): + super(MetMuseumDataIngester, self).__init__(date=date) + self.retries = 5 + + self.query_param = None + if self.date: + self.query_param = {"metadataDate": date} + + # this seems like useful information to track, rather than logging individually + self.object_ids_retrieved = 0 + self.non_cc0_objects = 0 + + def get_next_query_params(self, prev_query_params = None): + return self.query_param + + def get_batch_data(self, response_json): + if response_json: + self.object_ids_retrieved = response_json["total"] + logger.info(f"Total objects found {self.object_ids_retrieved}") + object_ids = response_json["objectIDs"] + else: + logger.warning("No content available") + return None + return object_ids + + def get_record_data(self, object_id): + + object_endpoint = f"{self.endpoint}/{object_id}" + object_json = self.delayed_requester.get_response_json( + object_endpoint, self.retries ) - -def _get_and_validate_object_json(object_id, endpoint=ENDPOINT): - object_endpoint = f"{endpoint}/{object_id}" - object_json = _get_response_json(None, object_endpoint) - if not object_json.get("isPublicDomain"): - logger.warning("CC0 license not detected") - object_json = None - return object_json - - -def _build_foreign_id(object_id, image_url): - unique_identifier = image_url.split("/")[-1].split(".")[0] - return f"{object_id}-{unique_identifier}" - - -def _create_meta_data(object_json): - meta_data = { - "accession_number": object_json.get("accessionNumber"), - "classification": object_json.get("classification"), - "culture": object_json.get("culture"), - "date": object_json.get("objectDate"), - "medium": object_json.get("medium"), - "credit_line": object_json.get("creditLine"), - } - meta_data = {k: v for k, v in meta_data.items() if v is not None} - return meta_data + if object_json.get("isPublicDomain") is False: + self.non_cc0_objects += 1 + if self.non_cc0_objects % self.batch_limit == 0: + logger.info(f"Retrieved {self.non_cc0_objects} non-CC0 records.") + return None + + main_image = object_json.get("primaryImage") + other_images = object_json.get("additionalImages", []) + image_list = [main_image] + other_images + + meta_data = self._create_meta_data(object_json) + raw_tags = self._create_tag_list(object_json) + + return [ + { + "foreign_landing_url": object_json.get("objectURL"), + "image_url": img, + "license_info": self.DEFAULT_LICENSE_INFO, + "foreign_identifier": self._build_foreign_id(object_id, img), + "creator": object_json.get("artistDisplayName"), + "title": object_json.get("title"), + "meta_data": meta_data, + "filetype": img.split(".")[-1], + "raw_tags": raw_tags, + } + for img in image_list + ] + + + def _build_foreign_id(self, object_id:int, image_url:str): + unique_identifier = image_url.split("/")[-1].split(".")[0] + return f"{object_id}-{unique_identifier}" + + def _create_meta_data(self, object_json): + meta_data = None + if object_json.get("accessionNumber") is not None: + meta_data = { + "accession_number": object_json.get("accessionNumber"), + } + return meta_data + + def _create_tag_list(self, object_json): + tag_list = [ + tag + for tag in [ + object_json.get("department"), + object_json.get("medium"), + object_json.get("culture"), + object_json.get("objectName"), + object_json.get("artistDisplayName"), + object_json.get("classification"), + object_json.get("objectDate"), + object_json.get("creditLine"), + ] + if tag is not None and tag != "" + ] + if object_json.get("tags") is not None and object_json.get("tags") != "": + tag_list += [tag["term"] for tag in object_json.get("tags")] + return tag_list + + def get_media_type(self, record): + # This provider only supports Images. + return "image" + + +def main(date: str): + logger.info("Begin: Metropolitan Museum data ingestion") + ingester = MetMuseumDataIngester(date) + ingester.ingest_records() if __name__ == "__main__": diff --git a/openverse_catalog/dags/providers/provider_workflows.py b/openverse_catalog/dags/providers/provider_workflows.py index fd0740796..9cc62b8ae 100644 --- a/openverse_catalog/dags/providers/provider_workflows.py +++ b/openverse_catalog/dags/providers/provider_workflows.py @@ -4,6 +4,7 @@ from providers.provider_api_scripts.cleveland_museum import ClevelandDataIngester from providers.provider_api_scripts.finnish_museums import FinnishMuseumsDataIngester +from providers.provider_api_scripts.metropolitan_museum import MetMuseumDataIngester from providers.provider_api_scripts.museum_victoria import VictoriaDataIngester from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester from providers.provider_api_scripts.science_museum import ScienceMuseumDataIngester @@ -116,7 +117,10 @@ def __post_init__(self): ), ProviderWorkflow( provider_script="metropolitan_museum", - start_date=datetime(2016, 9, 1), + ingester_class=MetMuseumDataIngester, + # changed for testing + # start_date=datetime(2016, 9, 1), + start_date=datetime(2022, 8, 8), schedule_string="@daily", dated=True, pull_timeout=timedelta(hours=12), diff --git a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_additional_image_data.json b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_additional_image_data.json index 0abff3fd9..bcabe95e9 100644 --- a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_additional_image_data.json +++ b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_additional_image_data.json @@ -1,18 +1,83 @@ -{ - "creator": "Kiyohara Yukinobu", - "foreign_identifier": "45734-1", - "foreign_landing_url": "https://www.metmuseum.org/art/collection/search/45734", - "image_url": "https://images.metmuseum.org/CRDImages/as/original/DP251120.jpg", - "license_": "cc0", - "license_version": "1.0", - "meta_data": { - "accession_number": "36.100.45", - "classification": "Paintings", - "credit_line": "The Howard Mansfield Collection, Purchase, Rogers Fund, 1936", - "culture": "Japan", - "date": "late 17th century", - "medium": "Hanging scroll; ink and color on silk" +[ + { + "creator": "Kiyohara Yukinobu", + "foreign_identifier": "45734-DP251139", + "foreign_landing_url": "https://wwwstg.metmuseum.org/art/collection/search/45734", + "image_url": "https://images.metmuseum.org/CRDImages/as/original/DP251139.jpg", + "license_info": { + "license": "cc0", + "version": "1.0", + "url": "https://creativecommons.org/publicdomain/zero/1.0/" + }, + "meta_data": { + "accession_number": "36.100.45" + }, + "raw_tags": [ + "Asian Art", + "Hanging scroll; ink and color on silk", + "Japan", + "Hanging scroll", + "Kiyohara Yukinobu", + "Paintings", + "late 17th century", + "The Howard Mansfield Collection, Purchase, Rogers Fund, 1936", + "Birds", + "Leaves" + ], + "title": "Quail and Millet" }, - "thumbnail_url": "", - "title": "Quail and Millet" -} + { + "creator": "Kiyohara Yukinobu", + "foreign_identifier": "45734-DP251138", + "foreign_landing_url": "https://wwwstg.metmuseum.org/art/collection/search/45734", + "image_url": "https://images.metmuseum.org/CRDImages/as/original/DP251138.jpg", + "license_info": { + "license": "cc0", + "version": "1.0", + "url": "https://creativecommons.org/publicdomain/zero/1.0/" + }, + "meta_data": { + "accession_number": "36.100.45" + }, + "raw_tags": [ + "Asian Art", + "Hanging scroll; ink and color on silk", + "Japan", + "Hanging scroll", + "Kiyohara Yukinobu", + "Paintings", + "late 17th century", + "The Howard Mansfield Collection, Purchase, Rogers Fund, 1936", + "Birds", + "Leaves" + ], + "title": "Quail and Millet" + }, + { + "creator": "Kiyohara Yukinobu", + "foreign_identifier": "45734-DP251120", + "foreign_landing_url": "https://wwwstg.metmuseum.org/art/collection/search/45734", + "image_url": "https://images.metmuseum.org/CRDImages/as/original/DP251120.jpg", + "license_info": { + "license": "cc0", + "version": "1.0", + "url": "https://creativecommons.org/publicdomain/zero/1.0/" + }, + "meta_data": { + "accession_number": "36.100.45" + }, + "raw_tags": [ + "Asian Art", + "Hanging scroll; ink and color on silk", + "Japan", + "Hanging scroll", + "Kiyohara Yukinobu", + "Paintings", + "late 17th century", + "The Howard Mansfield Collection, Purchase, Rogers Fund, 1936", + "Birds", + "Leaves" + ], + "title": "Quail and Millet" + } +] diff --git a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_image_data.json b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_image_data.json index 1a0573c22..a4e6a61d9 100644 --- a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_image_data.json +++ b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_image_data.json @@ -1,16 +1,25 @@ -{ - "creator": "Kiyohara Yukinobu", - "foreign_identifier": 45734, - "foreign_landing_url": "https://wwwstg.metmuseum.org/art/collection/search/45734", - "image_url": "https://images.metmuseum.org/CRDImages/as/original/DP251139.jpg", - "license": "cc0", - "license_version": "1.0", - "meta_data": { - "accession_number": "36.100.45", - "classification": "Paintings", - "credit_line": "The Howard Mansfield Collection, Purchase, Rogers Fund, 1936", - "culture": "Japan" - }, - "thumbnail_url": "https://images.metmuseum.org/CRDImages/as/web-large/DP251139.jpg", - "title": "Quail and Millet" -} +[ + { + "creator": "", + "foreign_identifier": "47533-79_2_414b_S1_sf", + "foreign_landing_url": "https://www.metmuseum.org/art/collection/search/47533", + "image_url": "https://images.metmuseum.org/CRDImages/as/original/79_2_414b_S1_sf.jpg", + "license_info": { + "license": "cc0", + "version": "1.0", + "url": "https://creativecommons.org/publicdomain/zero/1.0/" + }, + "meta_data": { + "accession_number": "79.2.414b" + }, + "raw_tags": [ + "Asian Art", + "Porcelain painted in underglaze blue", + "China", + "Cover", + "Ceramics", + "Purchase by subscription, 1879" + ], + "title": "Cover" + } +] diff --git a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_response_1027_not_cc0.json b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_response_1027_not_cc0.json new file mode 100644 index 000000000..975dde08e --- /dev/null +++ b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_response_1027_not_cc0.json @@ -0,0 +1 @@ +{"objectID":1027,"isHighlight":false,"accessionNumber":"67.35.2","accessionYear":"1967","isPublicDomain":false,"primaryImage":"","primaryImageSmall":"","additionalImages":[],"constituents":[{"constituentID":162583,"role":"Maker","name":"Unger Brothers","constituentULAN_URL":"http://vocab.getty.edu/page/ulan/500490017","constituentWikidata_URL":"https://www.wikidata.org/wiki/Q98065799","gender":""}],"department":"The American Wing","objectName":"Brooch","title":"Brooch","culture":"American","period":"","dynasty":"","reign":"","portfolio":"","artistRole":"Maker","artistPrefix":"","artistDisplayName":"Unger Brothers","artistDisplayBio":"1872–1919","artistSuffix":"","artistAlphaSort":"Unger Brothers","artistNationality":"","artistBeginDate":"1872","artistEndDate":"1919","artistGender":"","artistWikidata_URL":"https://www.wikidata.org/wiki/Q98065799","artistULAN_URL":"http://vocab.getty.edu/page/ulan/500490017","objectDate":"ca. 1900","objectBeginDate":1900,"objectEndDate":1905,"medium":"Silver","dimensions":"11/16 x 2 7/16 x 2 5/8 in. (1.7 x 6.2 x 6.7 cm); 2 dwt. (28.7 g)","measurements":[{"elementName":"Overall","elementDescription":null,"elementMeasurements":{"Depth":6.6675,"Height":1.7463,"Weight":0.0287,"Width":6.1913}}],"creditLine":"Gift of Ronald S. Kane, 1967","geographyType":"Made in","city":"Newark","state":"New Jersey","county":"","country":"United States","region":"Mid-Atlantic","subregion":"","locale":"","locus":"","excavation":"","river":"","classification":"","rightsAndReproduction":"","linkResource":"","metadataDate":"2022-08-12T04:53:21.933Z","repository":"Metropolitan Museum of Art, New York, NY","objectURL":"https://www.metmuseum.org/art/collection/search/1027","tags":null,"objectWikidata_URL":"","isTimelineWork":false,"GalleryNumber":"706"} \ No newline at end of file diff --git a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index 7967770c5..174c7cb50 100644 --- a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -6,181 +6,135 @@ import pytest import requests from common.licenses import LicenseInfo -from providers.provider_api_scripts import metropolitan_museum as mma +from providers.provider_api_scripts.metropolitan_museum import MetMuseumDataIngester -RESOURCES = Path(__file__).parent / "resources/metropolitan_museum_of_art" - logging.basicConfig( format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.INFO ) logger = logging.getLogger(__name__) +mma = MetMuseumDataIngester() + +RESOURCES = Path(__file__).parent / "resources/metropolitan_museum_of_art" +print(RESOURCES) + +# abbreviated response without other images 45733 +single_object_response = json.loads( + (RESOURCES / "sample_response_without_additional.json").read_text() +) +# single expected record if 45733 with no additional images +single_expected_data = json.loads((RESOURCES / "sample_image_data.json").read_text()) + +# response for objectid 45734 with 2 additional image urls +full_object_response = json.loads((RESOURCES / "sample_response.json").read_text()) +# 3 expected image records for objectid 45734 +full_expected_data = json.loads( + (RESOURCES / "sample_additional_image_data.json").read_text() +) + +# Not CC0 +ineligible_response = json.loads( + (RESOURCES / "sample_response_1027_not_cc0.json").read_text() +) + + CC0 = LicenseInfo( "cc0", "1.0", "https://creativecommons.org/publicdomain/zero/1.0/", None ) -def test_get_object_ids(): - r = requests.Response() - r.status_code = 200 - r.json = MagicMock(return_value={"total": 4, "objectIDs": [153, 1578, 465, 546]}) - with patch.object(mma.delayed_requester, "get", return_value=r): - total_objects = mma._get_object_ids("") - assert total_objects[0] == 4 - assert total_objects[1] == [153, 1578, 465, 546] - - -def test_get_response_json(): - expect_json = {"it": "works"} - endpoint = "https://abc.com" - query_params = {"a": "b"} - retries = 2 - with patch.object( - mma.delayed_requester, "get_response_json", return_value=expect_json - ) as mock_get_response_json: - r_json = mma._get_response_json(query_params, endpoint, retries=retries) - - assert r_json == expect_json - mock_get_response_json.assert_called_once_with( - endpoint, query_params=query_params, retries=retries - ) +@pytest.mark.parametrize( + "test_date, expected", + [ + ("2022-07-01", {"metadataDate": "2022-07-01"}), + (None, None) + ], +) +def test_get_next_query_params(test_date, expected): + ingester = MetMuseumDataIngester(test_date) + actual = ingester.get_next_query_params() + assert actual == expected + + +@pytest.mark.parametrize( + "response_json, expected", + [ + ({"total": 4, "objectIDs": [153, 1578, 465, 546]}, [153, 1578, 465, 546]), + (None, None) + ], +) +def test_get_batch_data(response_json, expected): + actual = mma.get_batch_data(response_json) + assert actual == expected -def test_create_meta_data(): - exact_response = { - "accessionNumber": "36.100.45", - "classification": "Paintings", - "creditLine": "The Howard Mansfield Collection, Purchase, Rogers Fund, 1936", - "culture": "Japan", - "objectDate": "late 17th century", - "medium": "Hanging scroll; ink and color on silk", - } - exact_meta_data = { - "accession_number": "36.100.45", - "classification": "Paintings", - "credit_line": "The Howard Mansfield Collection, Purchase, Rogers Fund, 1936", - "culture": "Japan", - "date": "late 17th century", - "medium": "Hanging scroll; ink and color on silk", - } - r = requests.Response() - r.status_code = 200 - r.json = MagicMock(return_value=exact_response) - with patch.object(mma.delayed_requester, "get", return_value=r): - response = mma._get_response_json(None, "", retries=2) - meta_data = mma._create_meta_data(response) +@pytest.mark.parametrize( + "case_name, response_json, expected", + [ + ("single image", single_object_response, single_expected_data[0].get("meta_data")), + ("multi images", full_object_response, full_expected_data[0].get("meta_data")), + ], +) +def test_create_meta_data(case_name, response_json, expected): + actual = mma._create_meta_data(response_json) + assert expected == actual - assert exact_meta_data == meta_data + +@pytest.mark.parametrize( + "case_name, response_json, expected", + [ + ("single image", single_object_response, single_expected_data[0].get("raw_tags")), + ("multi images", full_object_response, full_expected_data[0].get("raw_tags")), + ], +) +def test_create_tag_list(case_name, response_json, expected): + actual = mma._create_tag_list(response_json) + assert expected == actual -def test_get_data_for_image_with_none_response(): +def test_get_record_data_with_none_response(): with patch.object(mma.delayed_requester, "get", return_value=None) as mock_get: with pytest.raises(Exception): - assert mma._get_data_for_image(10) - + assert mma.get_record_data(10) assert mock_get.call_count == 6 -def test_get_data_for_image_with_non_ok(): +def test_get_record_data_with_non_ok(): r = requests.Response() r.status_code = 504 r.json = MagicMock(return_value={}) with patch.object(mma.delayed_requester, "get", return_value=r) as mock_get: with pytest.raises(Exception): - assert mma._get_data_for_image(10) - + assert mma.get_record_data(10) assert mock_get.call_count == 6 -def test_get_data_for_image_returns_response_json_when_all_ok(monkeypatch): - with open(RESOURCES / "sample_response_without_additional.json") as f: - actual_response_json = json.load(f) - - def mock_get_response_json(query_params, retries=0): - return actual_response_json - - monkeypatch.setattr(mma, "_get_response_json", mock_get_response_json) - with open(RESOURCES / "sample_additional_image_data.json") as f: - image_data = json.load(f) - - r = requests.Response() - r.status_code = 200 - r.json = MagicMock(return_value=image_data) - with patch.object( - mma.image_store, "save_item", return_value=image_data - ) as mock_save: - mma._get_data_for_image(45733) - - actual_image = mock_save.call_args[0][0] - - expected_image = { - "creator": "", - "foreign_identifier": "45733-79_2_414b_S1_sf", - "foreign_landing_url": "https://www.metmuseum.org/art/collection/search/47533", - "url": "https://images.metmuseum.org/CRDImages/as/original/79_2_414b_S1_sf.jpg", - "license_": CC0.license, - "license_version": CC0.version, - "filetype": "jpg", - "filesize": None, - "meta_data": { - "license_url": CC0.url, - "raw_license_url": CC0.raw_url, - "accession_number": "79.2.414b", - "classification": "Ceramics", - "culture": "China", - "date": "", - "medium": "Porcelain painted in underglaze blue", - "credit_line": "Purchase by subscription, 1879", - }, - "title": "Cover", - } - - assert mock_save.call_count == 1 - for key, value in expected_image.items(): - assert getattr(actual_image, key) == value - - -def test_get_data_for_image_returns_response_json_with_additional_images(monkeypatch): - with open(RESOURCES / "sample_response.json") as f: - actual_response_json = json.load(f) - - def mock_get_response_json(query_params, retries=0): - return actual_response_json - - monkeypatch.setattr(mma, "_get_response_json", mock_get_response_json) - with open(RESOURCES / "sample_additional_image_data.json") as f: - image_data = json.load(f) - - r = requests.Response() - r.status_code = 200 - r.json = MagicMock(return_value=image_data) - with patch.object(mma.image_store, "add_item", return_value=image_data) as mock_add: - mma._get_data_for_image(45734) - - mock_add.assert_called_with( - creator="Kiyohara Yukinobu", - foreign_identifier="45734-DP251120", - foreign_landing_url=( - "https://wwwstg.metmuseum.org/art/collection/search/45734" - ), - image_url="https://images.metmuseum.org/CRDImages/as/original/DP251120.jpg", - license_info=( - LicenseInfo( - "cc0", "1.0", "https://creativecommons.org/publicdomain/zero/1.0/", None - ) - ), - meta_data={ - "accession_number": "36.100.45", - "classification": "Paintings", - "culture": "Japan", - "date": "late 17th century", - "medium": "Hanging scroll; ink and color on silk", - "credit_line": ( - "The Howard Mansfield Collection, Purchase, Rogers Fund, 1936" - ), - }, - title="Quail and Millet", +@pytest.mark.parametrize( + "case_name, response_json, expected", + [ + ("single image", single_object_response, single_expected_data), + ("multi image", full_object_response, full_expected_data), + ("not cc0", ineligible_response, None) + ], +) +def test_get_record_data_returns_response_json_when_all_ok( + case_name, response_json, expected, monkeypatch +): + monkeypatch.setattr( + mma.delayed_requester, + "get_response_json", + lambda x, y: response_json ) - - assert mock_add.call_count == 3 + actual = mma.get_record_data(response_json.get("objectID")) + + if expected is None: + assert actual is None + else: + assert len(actual) == len(expected) + for i in range(len(actual)): + for key, value in expected[i].items(): + if key == "license_info": + assert actual[i].get(key) == CC0 + else: + assert actual[i].get(key) == value From eca69f5b8e8004df7ac5775723d3fdf91ec45c9e Mon Sep 17 00:00:00 2001 From: rwidom Date: Tue, 16 Aug 2022 14:01:16 -0400 Subject: [PATCH 02/28] linting --- .../metropolitan_museum.py | 8 +- .../sample_additional_image_data.json | 18 ++--- .../sample_image_data.json | 6 +- .../sample_response_1027_not_cc0.json | 80 ++++++++++++++++++- .../test_metropolitan_museum.py | 25 +++--- 5 files changed, 108 insertions(+), 29 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index f46829d08..e40f351b7 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -47,7 +47,6 @@ class MetMuseumDataIngester(ProviderDataIngester): endpoint = "https://collectionapi.metmuseum.org/public/collection/v1/objects" DEFAULT_LICENSE_INFO = get_license_info(license_="cc0", license_version="1.0") - def __init__(self, date: str = None): super(MetMuseumDataIngester, self).__init__(date=date) self.retries = 5 @@ -60,7 +59,7 @@ def __init__(self, date: str = None): self.object_ids_retrieved = 0 self.non_cc0_objects = 0 - def get_next_query_params(self, prev_query_params = None): + def get_next_query_params(self, prev_query_params=None): return self.query_param def get_batch_data(self, response_json): @@ -108,11 +107,10 @@ def get_record_data(self, object_id): for img in image_list ] - - def _build_foreign_id(self, object_id:int, image_url:str): + def _build_foreign_id(self, object_id: int, image_url: str): unique_identifier = image_url.split("/")[-1].split(".")[0] return f"{object_id}-{unique_identifier}" - + def _create_meta_data(self, object_json): meta_data = None if object_json.get("accessionNumber") is not None: diff --git a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_additional_image_data.json b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_additional_image_data.json index bcabe95e9..7da440276 100644 --- a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_additional_image_data.json +++ b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_additional_image_data.json @@ -5,9 +5,9 @@ "foreign_landing_url": "https://wwwstg.metmuseum.org/art/collection/search/45734", "image_url": "https://images.metmuseum.org/CRDImages/as/original/DP251139.jpg", "license_info": { - "license": "cc0", - "version": "1.0", - "url": "https://creativecommons.org/publicdomain/zero/1.0/" + "license": "cc0", + "url": "https://creativecommons.org/publicdomain/zero/1.0/", + "version": "1.0" }, "meta_data": { "accession_number": "36.100.45" @@ -32,9 +32,9 @@ "foreign_landing_url": "https://wwwstg.metmuseum.org/art/collection/search/45734", "image_url": "https://images.metmuseum.org/CRDImages/as/original/DP251138.jpg", "license_info": { - "license": "cc0", - "version": "1.0", - "url": "https://creativecommons.org/publicdomain/zero/1.0/" + "license": "cc0", + "url": "https://creativecommons.org/publicdomain/zero/1.0/", + "version": "1.0" }, "meta_data": { "accession_number": "36.100.45" @@ -59,9 +59,9 @@ "foreign_landing_url": "https://wwwstg.metmuseum.org/art/collection/search/45734", "image_url": "https://images.metmuseum.org/CRDImages/as/original/DP251120.jpg", "license_info": { - "license": "cc0", - "version": "1.0", - "url": "https://creativecommons.org/publicdomain/zero/1.0/" + "license": "cc0", + "url": "https://creativecommons.org/publicdomain/zero/1.0/", + "version": "1.0" }, "meta_data": { "accession_number": "36.100.45" diff --git a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_image_data.json b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_image_data.json index a4e6a61d9..380443930 100644 --- a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_image_data.json +++ b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_image_data.json @@ -5,9 +5,9 @@ "foreign_landing_url": "https://www.metmuseum.org/art/collection/search/47533", "image_url": "https://images.metmuseum.org/CRDImages/as/original/79_2_414b_S1_sf.jpg", "license_info": { - "license": "cc0", - "version": "1.0", - "url": "https://creativecommons.org/publicdomain/zero/1.0/" + "license": "cc0", + "url": "https://creativecommons.org/publicdomain/zero/1.0/", + "version": "1.0" }, "meta_data": { "accession_number": "79.2.414b" diff --git a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_response_1027_not_cc0.json b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_response_1027_not_cc0.json index 975dde08e..d8e46cbd6 100644 --- a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_response_1027_not_cc0.json +++ b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_response_1027_not_cc0.json @@ -1 +1,79 @@ -{"objectID":1027,"isHighlight":false,"accessionNumber":"67.35.2","accessionYear":"1967","isPublicDomain":false,"primaryImage":"","primaryImageSmall":"","additionalImages":[],"constituents":[{"constituentID":162583,"role":"Maker","name":"Unger Brothers","constituentULAN_URL":"http://vocab.getty.edu/page/ulan/500490017","constituentWikidata_URL":"https://www.wikidata.org/wiki/Q98065799","gender":""}],"department":"The American Wing","objectName":"Brooch","title":"Brooch","culture":"American","period":"","dynasty":"","reign":"","portfolio":"","artistRole":"Maker","artistPrefix":"","artistDisplayName":"Unger Brothers","artistDisplayBio":"1872–1919","artistSuffix":"","artistAlphaSort":"Unger Brothers","artistNationality":"","artistBeginDate":"1872","artistEndDate":"1919","artistGender":"","artistWikidata_URL":"https://www.wikidata.org/wiki/Q98065799","artistULAN_URL":"http://vocab.getty.edu/page/ulan/500490017","objectDate":"ca. 1900","objectBeginDate":1900,"objectEndDate":1905,"medium":"Silver","dimensions":"11/16 x 2 7/16 x 2 5/8 in. (1.7 x 6.2 x 6.7 cm); 2 dwt. (28.7 g)","measurements":[{"elementName":"Overall","elementDescription":null,"elementMeasurements":{"Depth":6.6675,"Height":1.7463,"Weight":0.0287,"Width":6.1913}}],"creditLine":"Gift of Ronald S. Kane, 1967","geographyType":"Made in","city":"Newark","state":"New Jersey","county":"","country":"United States","region":"Mid-Atlantic","subregion":"","locale":"","locus":"","excavation":"","river":"","classification":"","rightsAndReproduction":"","linkResource":"","metadataDate":"2022-08-12T04:53:21.933Z","repository":"Metropolitan Museum of Art, New York, NY","objectURL":"https://www.metmuseum.org/art/collection/search/1027","tags":null,"objectWikidata_URL":"","isTimelineWork":false,"GalleryNumber":"706"} \ No newline at end of file +{ + "GalleryNumber": "706", + "accessionNumber": "67.35.2", + "accessionYear": "1967", + "additionalImages": [], + "artistAlphaSort": "Unger Brothers", + "artistBeginDate": "1872", + "artistDisplayBio": "1872\u20131919", + "artistDisplayName": "Unger Brothers", + "artistEndDate": "1919", + "artistGender": "", + "artistNationality": "", + "artistPrefix": "", + "artistRole": "Maker", + "artistSuffix": "", + "artistULAN_URL": "http://vocab.getty.edu/page/ulan/500490017", + "artistWikidata_URL": "https://www.wikidata.org/wiki/Q98065799", + "city": "Newark", + "classification": "", + "constituents": [ + { + "constituentID": 162583, + "constituentULAN_URL": "http://vocab.getty.edu/page/ulan/500490017", + "constituentWikidata_URL": "https://www.wikidata.org/wiki/Q98065799", + "gender": "", + "name": "Unger Brothers", + "role": "Maker" + } + ], + "country": "United States", + "county": "", + "creditLine": "Gift of Ronald S. Kane, 1967", + "culture": "American", + "department": "The American Wing", + "dimensions": "11/16 x 2 7/16 x 2 5/8 in. (1.7 x 6.2 x 6.7 cm); 2 dwt. (28.7 g)", + "dynasty": "", + "excavation": "", + "geographyType": "Made in", + "isHighlight": false, + "isPublicDomain": false, + "isTimelineWork": false, + "linkResource": "", + "locale": "", + "locus": "", + "measurements": [ + { + "elementDescription": null, + "elementMeasurements": { + "Depth": 6.6675, + "Height": 1.7463, + "Weight": 0.0287, + "Width": 6.1913 + }, + "elementName": "Overall" + } + ], + "medium": "Silver", + "metadataDate": "2022-08-12T04:53:21.933Z", + "objectBeginDate": 1900, + "objectDate": "ca. 1900", + "objectEndDate": 1905, + "objectID": 1027, + "objectName": "Brooch", + "objectURL": "https://www.metmuseum.org/art/collection/search/1027", + "objectWikidata_URL": "", + "period": "", + "portfolio": "", + "primaryImage": "", + "primaryImageSmall": "", + "region": "Mid-Atlantic", + "reign": "", + "repository": "Metropolitan Museum of Art, New York, NY", + "rightsAndReproduction": "", + "river": "", + "state": "New Jersey", + "subregion": "", + "tags": null, + "title": "Brooch" +} diff --git a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index 174c7cb50..c4af0e932 100644 --- a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -46,10 +46,7 @@ @pytest.mark.parametrize( "test_date, expected", - [ - ("2022-07-01", {"metadataDate": "2022-07-01"}), - (None, None) - ], + [("2022-07-01", {"metadataDate": "2022-07-01"}), (None, None)], ) def test_get_next_query_params(test_date, expected): ingester = MetMuseumDataIngester(test_date) @@ -61,7 +58,7 @@ def test_get_next_query_params(test_date, expected): "response_json, expected", [ ({"total": 4, "objectIDs": [153, 1578, 465, 546]}, [153, 1578, 465, 546]), - (None, None) + (None, None), ], ) def test_get_batch_data(response_json, expected): @@ -72,7 +69,11 @@ def test_get_batch_data(response_json, expected): @pytest.mark.parametrize( "case_name, response_json, expected", [ - ("single image", single_object_response, single_expected_data[0].get("meta_data")), + ( + "single image", + single_object_response, + single_expected_data[0].get("meta_data"), + ), ("multi images", full_object_response, full_expected_data[0].get("meta_data")), ], ) @@ -84,7 +85,11 @@ def test_create_meta_data(case_name, response_json, expected): @pytest.mark.parametrize( "case_name, response_json, expected", [ - ("single image", single_object_response, single_expected_data[0].get("raw_tags")), + ( + "single image", + single_object_response, + single_expected_data[0].get("raw_tags"), + ), ("multi images", full_object_response, full_expected_data[0].get("raw_tags")), ], ) @@ -115,16 +120,14 @@ def test_get_record_data_with_non_ok(): [ ("single image", single_object_response, single_expected_data), ("multi image", full_object_response, full_expected_data), - ("not cc0", ineligible_response, None) + ("not cc0", ineligible_response, None), ], ) def test_get_record_data_returns_response_json_when_all_ok( case_name, response_json, expected, monkeypatch ): monkeypatch.setattr( - mma.delayed_requester, - "get_response_json", - lambda x, y: response_json + mma.delayed_requester, "get_response_json", lambda x, y: response_json ) actual = mma.get_record_data(response_json.get("objectID")) From f1988ce0287fa55fb3481047116f5d008b85346c Mon Sep 17 00:00:00 2001 From: rwidom Date: Wed, 17 Aug 2022 04:37:42 -0400 Subject: [PATCH 03/28] update dag docs --- DAGs.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/DAGs.md b/DAGs.md index fc48f9777..69553b367 100644 --- a/DAGs.md +++ b/DAGs.md @@ -325,6 +325,13 @@ Output: TSV file containing the image, their respective Notes: https://metmuseum.github.io/ No rate limit specified. + https://metmuseum.github.io/#search + Please limit requests to 80 requests per second, which we + certainly do, but maybe there are additional constraints we + should be aware of. + + There is also a csv file on github, which might be useful. + https://github.com/metmuseum/openaccess From 62a02ffb4a4899eb7160c71f41a3425f1f7b9fa4 Mon Sep 17 00:00:00 2001 From: rwidom Date: Wed, 17 Aug 2022 07:42:24 -0400 Subject: [PATCH 04/28] address issue 641 missing titles --- .../metropolitan_museum.py | 31 ++++++++++--------- .../test_metropolitan_museum.py | 13 ++++++++ 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index e40f351b7..f2f56f346 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -6,15 +6,13 @@ Output: TSV file containing the image, their respective meta-data. -Notes: https://metmuseum.github.io/ - No rate limit specified. - https://metmuseum.github.io/#search - Please limit requests to 80 requests per second, which we - certainly do, but maybe there are additional constraints we - should be aware of. - - There is also a csv file on github, which might be useful. - https://github.com/metmuseum/openaccess +Notes: https://metmuseum.github.io/#search + "Please limit requests to 80 requests per second." Changing + delay to 3 seconds, because of blocking encountered during + development. + + Some analysis to improve data quality was conducted using a + separate csv file here: https://github.com/metmuseum/openaccess """ import argparse @@ -34,12 +32,9 @@ # https://collectionapi.metmuseum.org/public/collection/v1/objects?metadataDate=2022-08-10 # get a specific object: # https://collectionapi.metmuseum.org/public/collection/v1/objects/1027 -# I wish that we could filter on license in the first call, but seems like we can't. # The search functionality requires a specific query (term search) in addition to date # and public domain. It seems like it won't connect with just date and license. # https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 -# https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 -# https://collectionapi.metmuseum.org/public/collection/v1/search?departmentId=1&isPublicDomain=true&metadataDate=2021-08-01 class MetMuseumDataIngester(ProviderDataIngester): @@ -50,6 +45,7 @@ class MetMuseumDataIngester(ProviderDataIngester): def __init__(self, date: str = None): super(MetMuseumDataIngester, self).__init__(date=date) self.retries = 5 + self.delayed_requester._DELAY = 3 self.query_param = None if self.date: @@ -66,11 +62,10 @@ def get_batch_data(self, response_json): if response_json: self.object_ids_retrieved = response_json["total"] logger.info(f"Total objects found {self.object_ids_retrieved}") - object_ids = response_json["objectIDs"] + return response_json["objectIDs"] else: logger.warning("No content available") return None - return object_ids def get_record_data(self, object_id): @@ -99,7 +94,7 @@ def get_record_data(self, object_id): "license_info": self.DEFAULT_LICENSE_INFO, "foreign_identifier": self._build_foreign_id(object_id, img), "creator": object_json.get("artistDisplayName"), - "title": object_json.get("title"), + "title": self._create_title(object_json), "meta_data": meta_data, "filetype": img.split(".")[-1], "raw_tags": raw_tags, @@ -138,6 +133,12 @@ def _create_tag_list(self, object_json): tag_list += [tag["term"] for tag in object_json.get("tags")] return tag_list + def _create_title(self, record): + if record.get("title"): + return record.get("title") + else: + return record.get("objectName") + def get_media_type(self, record): # This provider only supports Images. return "image" diff --git a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index c4af0e932..a59e04564 100644 --- a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -98,6 +98,19 @@ def test_create_tag_list(case_name, response_json, expected): assert expected == actual +@pytest.mark.parametrize( + "response_json, expected", + [ + ({"title": "Yes, regular case", "objectName": "Wrong"}, "Yes, regular case"), + ({"objectName": "Yes, no title at all"}, "Yes, no title at all"), + ({"title": "", "objectName": "Yes, empty title"}, "Yes, empty title"), + ], +) +def test_create_title(response_json, expected): + actual = mma._create_title(response_json) + assert actual == expected + + def test_get_record_data_with_none_response(): with patch.object(mma.delayed_requester, "get", return_value=None) as mock_get: with pytest.raises(Exception): From 0de8ae750effe2381ea7e4e4a87c4969e39a768e Mon Sep 17 00:00:00 2001 From: rwidom Date: Wed, 17 Aug 2022 07:43:54 -0400 Subject: [PATCH 05/28] update notes in dag docs --- DAGs.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/DAGs.md b/DAGs.md index 69553b367..831383edf 100644 --- a/DAGs.md +++ b/DAGs.md @@ -323,15 +323,13 @@ ETL Process: Use the API to identify all CC0 artworks. Output: TSV file containing the image, their respective meta-data. -Notes: https://metmuseum.github.io/ - No rate limit specified. - https://metmuseum.github.io/#search - Please limit requests to 80 requests per second, which we - certainly do, but maybe there are additional constraints we - should be aware of. +Notes: https://metmuseum.github.io/#search + "Please limit requests to 80 requests per second." Changing + delay to 3 seconds, because of blocking encountered during + development. - There is also a csv file on github, which might be useful. - https://github.com/metmuseum/openaccess + Some analysis to improve data quality was conducted using a + separate csv file here: https://github.com/metmuseum/openaccess From db7a861fd13d6e9d8cd59a2c27678fecffad3314 Mon Sep 17 00:00:00 2001 From: rwidom Date: Wed, 17 Aug 2022 19:52:40 -0400 Subject: [PATCH 06/28] csv handling for data quality analysis --- .../read_met_csv.py | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/read_met_csv.py diff --git a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/read_met_csv.py b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/read_met_csv.py new file mode 100644 index 000000000..a4e557a96 --- /dev/null +++ b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/read_met_csv.py @@ -0,0 +1,158 @@ +""" +Ad hoc exploratory analysis for identifying test cases for the API workflow. +To execute this, download MetObjects.csv from here: +https://github.com/metmuseum/openaccess/blob/master/MetObjects.csv +""" + +from pathlib import Path +from time import sleep + +import pandas as pd +import requests + + +pd.set_option("display.min_rows", 500) +pd.set_option("display.width", None) + +DIR = Path(__file__).parent +raw_data_file_name = "MetObjects.csv" + +raw_data = pd.read_csv( + DIR / raw_data_file_name, + header=0, + usecols=[ + "Is Public Domain", + "Object ID", + "Object Number", + "Department", + "Object Name", + "Title", + "Culture", + "Artist Display Name", + "Medium", + "Credit Line", + "Classification", + "Link Resource", + "Tags", + "Period", + "Object Date", + "artistWikidata_URL", + "artistULAN_URL", + ], +) + +# *** object_number is never missing. +# is_highlight is never missing. +# is_timeline_work is never missing. +# gallery_number is missing 85% of values, see 860873 +# *** department is never missing. +# accessionyear is missing 0% of values, see 854659 +# *** object_name is missing 0% of values, see 854659 +# *** title is missing 10% of values, see 856958 +# *** culture is missing 42% of values, see 860873 +# --> period is missing 71% of values, see 860873 +# dynasty is missing 95% of values, see 860873 +# reign is missing 97% of values, see 860873 +# portfolio is missing 96% of values, see 860873 +# constituent_id is missing 56% of values, see 857718 +# artist_role is missing 57% of values, see 857718 +# artist_prefix is missing 56% of values, see 857718 +# *** artist_display_name is missing 56% of values, see 857718 +# artist_display_bio is missing 57% of values, see 857718 +# artist_suffix is missing 56% of values, see 857718 +# artist_alpha_sort is missing 56% of values, see 857718 +# artist_nationality is missing 56% of values, see 857718 +# artist_begin_date is missing 56% of values, see 857718 +# artist_end_date is missing 56% of values, see 857718 +# artist_gender is missing 86% of values, see 860873 +# artist_ulan_url is missing 64% of values, see 860873 +# artist_wikidata_url is missing 65% of values, see 860873 +# --> object_date is missing 3% of values, see 856790 +# object_begin_date is never missing. +# object_end_date is never missing. +# *** medium is missing 0% of values, see 854659 +# dimensions is missing 13% of values, see 856203 +# *** credit_line is missing 0% of values, see 853970 +# geography_type is missing 84% of values, see 860873 +# city is missing 93% of values, see 860873 +# state is missing 99% of values, see 860873 +# county is missing 97% of values, see 860873 +# country is missing 81% of values, see 860873 +# region is missing 92% of values, see 860873 +# subregion is missing 94% of values, see 860873 +# locale is missing 96% of values, see 860873 +# locus is missing 98% of values, see 860873 +# excavation is missing 96% of values, see 860873 +# river is missing 99% of values, see 860873 +# *** classification is missing 13% of values, see 854659 +# rights_and_reproduction is missing 99% of values, see 860873 +# *** link_resource is never missing. +# object_wikidata_url is missing 93% of values, see 860873 +# metadata_date is always missing. +# --> repository is never missing, but 1 val: 'Metropolitan Museum of Art, New York, NY' +# *** tags is missing 41% of values, see 857718 +# tags_aat_url is missing 41% of values, see 857718 +# tags_wikidata_url is missing 41% of values, see 857718 + +new_column_names = {col: col.lower().replace(" ", "_") for col in raw_data.columns} +raw_data.rename(columns=new_column_names, inplace=True) + +print("--> Column names:", "\n".join(raw_data.columns), sep="\n") +print("--> Licensing:", raw_data.is_public_domain.value_counts(dropna=False), sep="\n") + +raw_data.set_index("object_id", inplace=True) +raw_data.drop(raw_data[raw_data["is_public_domain"] == 0].index, inplace=True) +raw_data.drop(["is_public_domain"], axis=1, inplace=True) + +total_records = len(raw_data) +print(f"--> Total CC0 records: {total_records}") +print("--> Percent populated for different fields...") +sample_cases = set() +for c in raw_data.columns: + m = raw_data[c].isna().sum() + if m == total_records: + print(f"{c} is always missing.") + elif m > 0: + example = max(raw_data[raw_data[c].isna()].index) + sample_cases.add(example) + print(f"{c} is missing {int(100.0*m/total_records)}% of values, see {example}") + else: + print(f"{c} is never missing.") + +# add existing test cases +sample_cases = sample_cases.union([45733, 45734, 1027]) +# from https://github.com/WordPress/openverse-catalog/issues/641 +sample_cases = sample_cases.union([855619, 479, 747045, 5915, 4542]) + +for object_id in sample_cases: + print(f"Requesting {object_id}... ", end="") + file_name = f"sample_response_{object_id}.json" + object_url = ( + f"https://collectionapi.metmuseum.org/public/collection/v1/objects/{object_id}" + ) + response = requests.get(object_url) + if response.json(): + with open(DIR / file_name, "w") as f: + f.write(response.text) + print(":) YAY!") + else: + print(":( no response") + sleep(5) + +print("handle missing titles...") +print( + raw_data[["title", "object_name", "classification", "artist_display_name"]].query( + "not(title == title)" + ) +) + +print("artist display names...") +print( + raw_data.query( + "artist_display_name == 'Unidentified' " + + "or artist_display_name == 'Unidentified artist'" + ).artist_display_name.value_counts() +) + +print("medium...") +print(raw_data.medium.value_counts()) From c42c8bcf933b64209b1b612a2389291ccf838762 Mon Sep 17 00:00:00 2001 From: rwidom Date: Wed, 17 Aug 2022 19:53:51 -0400 Subject: [PATCH 07/28] artist name, title, and commenting --- .../metropolitan_museum.py | 52 ++++++++---- .../sample_additional_image_data.json | 3 + .../sample_image_data.json | 3 +- .../sample_response_1027_not_cc0.json | 79 ------------------- .../test_metropolitan_museum.py | 37 ++++++--- 5 files changed, 66 insertions(+), 108 deletions(-) delete mode 100644 tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_response_1027_not_cc0.json diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index f2f56f346..af1df00da 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -51,9 +51,11 @@ def __init__(self, date: str = None): if self.date: self.query_param = {"metadataDate": date} - # this seems like useful information to track, rather than logging individually - self.object_ids_retrieved = 0 - self.non_cc0_objects = 0 + # this seems like useful information to track for context on the existing load + # metrics, but just adding them to the log in aggregate for now rather than + # logging each record individually or doing something fancier in airflow. + self.object_ids_retrieved = 0 # total object IDs based on date + self.non_cc0_objects = 0 # number checked and ignored because of licensing def get_next_query_params(self, prev_query_params=None): return self.query_param @@ -68,7 +70,6 @@ def get_batch_data(self, response_json): return None def get_record_data(self, object_id): - object_endpoint = f"{self.endpoint}/{object_id}" object_json = self.delayed_requester.get_response_json( object_endpoint, self.retries @@ -84,17 +85,25 @@ def get_record_data(self, object_id): other_images = object_json.get("additionalImages", []) image_list = [main_image] + other_images - meta_data = self._create_meta_data(object_json) - raw_tags = self._create_tag_list(object_json) + meta_data = self._get_meta_data(object_json) + raw_tags = self._get_tag_list(object_json) + title = self._get_title(object_json) + artist = self._get_artist_name(object_json) + + # We aren't currently populating creator_url. In theory we could url encode + # f"https://collectionapi.metmuseum.org/public/collection/v1/search?artistOrCulture={artist}" + # per API guide here: https://metmuseum.github.io/#search + # but it seems fairly buggy (i.e. nonresponsive), at least when tested with + # "Chelsea Porcelain Manufactory" and "Minton(s)" and "Jean Pucelle" return [ { "foreign_landing_url": object_json.get("objectURL"), "image_url": img, "license_info": self.DEFAULT_LICENSE_INFO, - "foreign_identifier": self._build_foreign_id(object_id, img), - "creator": object_json.get("artistDisplayName"), - "title": self._create_title(object_json), + "foreign_identifier": self._get_foreign_id(object_id, img), + "creator": artist, + "title": title, "meta_data": meta_data, "filetype": img.split(".")[-1], "raw_tags": raw_tags, @@ -102,11 +111,11 @@ def get_record_data(self, object_id): for img in image_list ] - def _build_foreign_id(self, object_id: int, image_url: str): + def _get_foreign_id(self, object_id: int, image_url: str): unique_identifier = image_url.split("/")[-1].split(".")[0] return f"{object_id}-{unique_identifier}" - def _create_meta_data(self, object_json): + def _get_meta_data(self, object_json): meta_data = None if object_json.get("accessionNumber") is not None: meta_data = { @@ -114,7 +123,7 @@ def _create_meta_data(self, object_json): } return meta_data - def _create_tag_list(self, object_json): + def _get_tag_list(self, object_json): tag_list = [ tag for tag in [ @@ -122,23 +131,34 @@ def _create_tag_list(self, object_json): object_json.get("medium"), object_json.get("culture"), object_json.get("objectName"), - object_json.get("artistDisplayName"), + self._get_artist_name(object_json), object_json.get("classification"), object_json.get("objectDate"), object_json.get("creditLine"), + object_json.get("period"), ] - if tag is not None and tag != "" + if tag ] - if object_json.get("tags") is not None and object_json.get("tags") != "": + if object_json.get("tags"): tag_list += [tag["term"] for tag in object_json.get("tags")] return tag_list - def _create_title(self, record): + def _get_title(self, record): if record.get("title"): return record.get("title") else: return record.get("objectName") + def _get_artist_name(self, record): + artist = record.get("artistDisplayName") + # Treating "unidentified" the same as missing, but maybe it would be useful in + # in search? Maybe in the art world it has a more specific outsider art + # connotation? + if artist in ["Unidentified", "Unidentified artist"]: + return None + else: + return artist + def get_media_type(self, record): # This provider only supports Images. return "image" diff --git a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_additional_image_data.json b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_additional_image_data.json index 7da440276..7c6ea7386 100644 --- a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_additional_image_data.json +++ b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_additional_image_data.json @@ -21,6 +21,7 @@ "Paintings", "late 17th century", "The Howard Mansfield Collection, Purchase, Rogers Fund, 1936", + "Edo period (1615\u20131868)", "Birds", "Leaves" ], @@ -48,6 +49,7 @@ "Paintings", "late 17th century", "The Howard Mansfield Collection, Purchase, Rogers Fund, 1936", + "Edo period (1615\u20131868)", "Birds", "Leaves" ], @@ -75,6 +77,7 @@ "Paintings", "late 17th century", "The Howard Mansfield Collection, Purchase, Rogers Fund, 1936", + "Edo period (1615\u20131868)", "Birds", "Leaves" ], diff --git a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_image_data.json b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_image_data.json index 380443930..403b2680a 100644 --- a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_image_data.json +++ b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_image_data.json @@ -18,7 +18,8 @@ "China", "Cover", "Ceramics", - "Purchase by subscription, 1879" + "Purchase by subscription, 1879", + "Qing dynasty (1644\u20131911), Kangxi period (1662\u20131722)" ], "title": "Cover" } diff --git a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_response_1027_not_cc0.json b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_response_1027_not_cc0.json deleted file mode 100644 index d8e46cbd6..000000000 --- a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/sample_response_1027_not_cc0.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "GalleryNumber": "706", - "accessionNumber": "67.35.2", - "accessionYear": "1967", - "additionalImages": [], - "artistAlphaSort": "Unger Brothers", - "artistBeginDate": "1872", - "artistDisplayBio": "1872\u20131919", - "artistDisplayName": "Unger Brothers", - "artistEndDate": "1919", - "artistGender": "", - "artistNationality": "", - "artistPrefix": "", - "artistRole": "Maker", - "artistSuffix": "", - "artistULAN_URL": "http://vocab.getty.edu/page/ulan/500490017", - "artistWikidata_URL": "https://www.wikidata.org/wiki/Q98065799", - "city": "Newark", - "classification": "", - "constituents": [ - { - "constituentID": 162583, - "constituentULAN_URL": "http://vocab.getty.edu/page/ulan/500490017", - "constituentWikidata_URL": "https://www.wikidata.org/wiki/Q98065799", - "gender": "", - "name": "Unger Brothers", - "role": "Maker" - } - ], - "country": "United States", - "county": "", - "creditLine": "Gift of Ronald S. Kane, 1967", - "culture": "American", - "department": "The American Wing", - "dimensions": "11/16 x 2 7/16 x 2 5/8 in. (1.7 x 6.2 x 6.7 cm); 2 dwt. (28.7 g)", - "dynasty": "", - "excavation": "", - "geographyType": "Made in", - "isHighlight": false, - "isPublicDomain": false, - "isTimelineWork": false, - "linkResource": "", - "locale": "", - "locus": "", - "measurements": [ - { - "elementDescription": null, - "elementMeasurements": { - "Depth": 6.6675, - "Height": 1.7463, - "Weight": 0.0287, - "Width": 6.1913 - }, - "elementName": "Overall" - } - ], - "medium": "Silver", - "metadataDate": "2022-08-12T04:53:21.933Z", - "objectBeginDate": 1900, - "objectDate": "ca. 1900", - "objectEndDate": 1905, - "objectID": 1027, - "objectName": "Brooch", - "objectURL": "https://www.metmuseum.org/art/collection/search/1027", - "objectWikidata_URL": "", - "period": "", - "portfolio": "", - "primaryImage": "", - "primaryImageSmall": "", - "region": "Mid-Atlantic", - "reign": "", - "repository": "Metropolitan Museum of Art, New York, NY", - "rightsAndReproduction": "", - "river": "", - "state": "New Jersey", - "subregion": "", - "tags": null, - "title": "Brooch" -} diff --git a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index a59e04564..efa6372a9 100644 --- a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -33,11 +33,6 @@ (RESOURCES / "sample_additional_image_data.json").read_text() ) -# Not CC0 -ineligible_response = json.loads( - (RESOURCES / "sample_response_1027_not_cc0.json").read_text() -) - CC0 = LicenseInfo( "cc0", "1.0", "https://creativecommons.org/publicdomain/zero/1.0/", None @@ -77,8 +72,8 @@ def test_get_batch_data(response_json, expected): ("multi images", full_object_response, full_expected_data[0].get("meta_data")), ], ) -def test_create_meta_data(case_name, response_json, expected): - actual = mma._create_meta_data(response_json) +def test_get_meta_data(case_name, response_json, expected): + actual = mma._get_meta_data(response_json) assert expected == actual @@ -93,8 +88,8 @@ def test_create_meta_data(case_name, response_json, expected): ("multi images", full_object_response, full_expected_data[0].get("raw_tags")), ], ) -def test_create_tag_list(case_name, response_json, expected): - actual = mma._create_tag_list(response_json) +def test_get_tag_list(case_name, response_json, expected): + actual = mma._get_tag_list(response_json) assert expected == actual @@ -106,8 +101,22 @@ def test_create_tag_list(case_name, response_json, expected): ({"title": "", "objectName": "Yes, empty title"}, "Yes, empty title"), ], ) -def test_create_title(response_json, expected): - actual = mma._create_title(response_json) +def test_get_title(response_json, expected): + actual = mma._get_title(response_json) + assert actual == expected + + +@pytest.mark.parametrize( + "response_json, expected", + [ + ({}, None), + ({"artistDisplayName": "Unidentified"}, None), + ({"artistDisplayName": "Unidentified artist"}, None), + ({"artistDisplayName": "Unidentified flying obj"}, "Unidentified flying obj"), + ], +) +def test_get_artist_name(response_json, expected): + actual = mma._get_artist_name(response_json) assert actual == expected @@ -133,7 +142,11 @@ def test_get_record_data_with_non_ok(): [ ("single image", single_object_response, single_expected_data), ("multi image", full_object_response, full_expected_data), - ("not cc0", ineligible_response, None), + ( + "not cc0", + json.loads('{"isPublicDomain": false, "otherData": "is here too"}'), + None, + ), ], ) def test_get_record_data_returns_response_json_when_all_ok( From 75aaf10d4aec8a3e421ff80ae6580a3e0f26a0ab Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 07:57:55 -0400 Subject: [PATCH 08/28] Use the image store for filetype Co-authored-by: Olga Bulat --- .../dags/providers/provider_api_scripts/metropolitan_museum.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index af1df00da..6d8dad540 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -105,7 +105,6 @@ def get_record_data(self, object_id): "creator": artist, "title": title, "meta_data": meta_data, - "filetype": img.split(".")[-1], "raw_tags": raw_tags, } for img in image_list From 16d51e0f74ac64499b89f2cdb8eebda640f598c1 Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 17:31:55 -0400 Subject: [PATCH 09/28] add creator url options to comments --- .../providers/provider_api_scripts/metropolitan_museum.py | 2 ++ .../resources/metropolitan_museum_of_art/read_met_csv.py | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 6d8dad540..3153d3777 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -95,6 +95,8 @@ def get_record_data(self, object_id): # per API guide here: https://metmuseum.github.io/#search # but it seems fairly buggy (i.e. nonresponsive), at least when tested with # "Chelsea Porcelain Manufactory" and "Minton(s)" and "Jean Pucelle" + # Should we use artistWikidata_URL or artistULAN_URL? They're populated approx + # 65% of the time. return [ { diff --git a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/read_met_csv.py b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/read_met_csv.py index a4e557a96..7bc90be2e 100644 --- a/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/read_met_csv.py +++ b/tests/dags/providers/provider_api_scripts/resources/metropolitan_museum_of_art/read_met_csv.py @@ -36,8 +36,8 @@ "Tags", "Period", "Object Date", - "artistWikidata_URL", - "artistULAN_URL", + "Artist Wikidata URL", + "Artist ULAN URL", ], ) @@ -104,6 +104,10 @@ raw_data.drop(raw_data[raw_data["is_public_domain"] == 0].index, inplace=True) raw_data.drop(["is_public_domain"], axis=1, inplace=True) +raw_data["possible_creator_url"] = raw_data.artist_ulan_url.fillna( + raw_data.artist_wikidata_url +) + total_records = len(raw_data) print(f"--> Total CC0 records: {total_records}") print("--> Percent populated for different fields...") From ae996545b37d0fea88658c9a8326732930ffaeac Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 17:35:38 -0400 Subject: [PATCH 10/28] use set not list with in --- .../dags/providers/provider_api_scripts/metropolitan_museum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 3153d3777..3ff0fdd32 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -155,7 +155,7 @@ def _get_artist_name(self, record): # Treating "unidentified" the same as missing, but maybe it would be useful in # in search? Maybe in the art world it has a more specific outsider art # connotation? - if artist in ["Unidentified", "Unidentified artist"]: + if artist in {"Unidentified", "Unidentified artist"}: return None else: return artist From 6ee7f874aebf4495c686535bc302bb536fd9d15e Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 17:47:20 -0400 Subject: [PATCH 11/28] move api url commentary to the doc string --- .../metropolitan_museum.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 3ff0fdd32..e5948dcac 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -13,6 +13,16 @@ Some analysis to improve data quality was conducted using a separate csv file here: https://github.com/metmuseum/openaccess + + Get a list of object IDs: + https://collectionapi.metmuseum.org/public/collection/v1/objects?metadataDate=2022-08-10 + Get a specific object: + https://collectionapi.metmuseum.org/public/collection/v1/objects/1027 + The search functionality requires a specific query (term search) + in addition to date and public domain. It seems like it won't + connect with just date and license. + https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 + """ import argparse @@ -28,14 +38,6 @@ ) logger = logging.getLogger(__name__) -# get a list of object IDs for this week: -# https://collectionapi.metmuseum.org/public/collection/v1/objects?metadataDate=2022-08-10 -# get a specific object: -# https://collectionapi.metmuseum.org/public/collection/v1/objects/1027 -# The search functionality requires a specific query (term search) in addition to date -# and public domain. It seems like it won't connect with just date and license. -# https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 - class MetMuseumDataIngester(ProviderDataIngester): providers = {"image": prov.MET_MUSEUM_DEFAULT_PROVIDER} From f4319326ea8542b02f288efbdb09e7463959f3d0 Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 17:51:02 -0400 Subject: [PATCH 12/28] keep default delay --- .../providers/provider_api_scripts/metropolitan_museum.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index e5948dcac..0c0bc5e74 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -7,9 +7,9 @@ meta-data. Notes: https://metmuseum.github.io/#search - "Please limit requests to 80 requests per second." Changing - delay to 3 seconds, because of blocking encountered during - development. + "Please limit requests to 80 requests per second." May need to + bump up the delay (e.g. to 3 seconds), to avoid of blocking + during local development testing. Some analysis to improve data quality was conducted using a separate csv file here: https://github.com/metmuseum/openaccess @@ -47,7 +47,6 @@ class MetMuseumDataIngester(ProviderDataIngester): def __init__(self, date: str = None): super(MetMuseumDataIngester, self).__init__(date=date) self.retries = 5 - self.delayed_requester._DELAY = 3 self.query_param = None if self.date: From 67f667309d3a8444e7be816a11ee83e65fcdeba5 Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 17:54:52 -0400 Subject: [PATCH 13/28] more concise if no title, use object name Co-authored-by: Madison Swain-Bowden --- .../providers/provider_api_scripts/metropolitan_museum.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 0c0bc5e74..21cc39d13 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -146,10 +146,7 @@ def _get_tag_list(self, object_json): return tag_list def _get_title(self, record): - if record.get("title"): - return record.get("title") - else: - return record.get("objectName") + return record.get("title", record.get("objectName")) def _get_artist_name(self, record): artist = record.get("artistDisplayName") From 8ac2941cfda3abbba85f7d3fc88037fd4b3c4429 Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 17:56:09 -0400 Subject: [PATCH 14/28] remove unnecessary code from debugging Co-authored-by: Madison Swain-Bowden --- .../providers/provider_api_scripts/test_metropolitan_museum.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index efa6372a9..1d244ddc3 100644 --- a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -17,7 +17,6 @@ mma = MetMuseumDataIngester() RESOURCES = Path(__file__).parent / "resources/metropolitan_museum_of_art" -print(RESOURCES) # abbreviated response without other images 45733 single_object_response = json.loads( From b54f605734172501d5ec9420794ddff5d7521d85 Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 18:01:47 -0400 Subject: [PATCH 15/28] use zip to make loops more legible Co-authored-by: Madison Swain-Bowden --- .../provider_api_scripts/test_metropolitan_museum.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index 1d244ddc3..895ccb8ff 100644 --- a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -160,9 +160,9 @@ def test_get_record_data_returns_response_json_when_all_ok( assert actual is None else: assert len(actual) == len(expected) - for i in range(len(actual)): - for key, value in expected[i].items(): + for actual_result, expected_result in zip(actual, expected): + for key, value in expected_result.items(): if key == "license_info": - assert actual[i].get(key) == CC0 + assert actual_result.get(key) == CC0 else: - assert actual[i].get(key) == value + assert actual_result.get(key) == value From bc55683fe7ef0e54c506644762a03e9618c6609b Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 18:21:05 -0400 Subject: [PATCH 16/28] use if/else to skip falsey titles --- .../providers/provider_api_scripts/metropolitan_museum.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 21cc39d13..4bbca6c8b 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -146,7 +146,11 @@ def _get_tag_list(self, object_json): return tag_list def _get_title(self, record): - return record.get("title", record.get("objectName")) + # Use if/else here to skip false-y (empty) titles: "" + if record.get("title"): + return record.get("title") + else: + return record.get("objectName") def _get_artist_name(self, record): artist = record.get("artistDisplayName") From 5cca07ba20cb89721382bb972039cbbe156cfa7a Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 18:23:08 -0400 Subject: [PATCH 17/28] keep production start date --- openverse_catalog/dags/providers/provider_workflows.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_workflows.py b/openverse_catalog/dags/providers/provider_workflows.py index 9cc62b8ae..4fc42858a 100644 --- a/openverse_catalog/dags/providers/provider_workflows.py +++ b/openverse_catalog/dags/providers/provider_workflows.py @@ -118,9 +118,7 @@ def __post_init__(self): ProviderWorkflow( provider_script="metropolitan_museum", ingester_class=MetMuseumDataIngester, - # changed for testing - # start_date=datetime(2016, 9, 1), - start_date=datetime(2022, 8, 8), + start_date=datetime(2016, 9, 1), schedule_string="@daily", dated=True, pull_timeout=timedelta(hours=12), From 57ec40ee74b0000cf59cd95bcd54d02683767094 Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 18:30:19 -0400 Subject: [PATCH 18/28] regenerate dag docs --- DAGs.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/DAGs.md b/DAGs.md index 77e92ae9a..bae0c526d 100644 --- a/DAGs.md +++ b/DAGs.md @@ -350,13 +350,23 @@ Output: TSV file containing the image, their respective meta-data. Notes: https://metmuseum.github.io/#search - "Please limit requests to 80 requests per second." Changing - delay to 3 seconds, because of blocking encountered during - development. + "Please limit requests to 80 requests per second." May need to + bump up the delay (e.g. to 3 seconds), to avoid of blocking + during local development testing. Some analysis to improve data quality was conducted using a separate csv file here: https://github.com/metmuseum/openaccess + Get a list of object IDs: + https://collectionapi.metmuseum.org/public/collection/v1/objects?metadataDate=2022-08-10 + Get a specific object: + https://collectionapi.metmuseum.org/public/collection/v1/objects/1027 + The search functionality requires a specific query (term search) + in addition to date and public domain. It seems like it won't + connect with just date and license. + https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 + + ## `oauth2_authorization` From 1c2e0f5a318561957efaf5852d6f297d17d48dde Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 19:18:30 -0400 Subject: [PATCH 19/28] adding none and {} tests --- .../metropolitan_museum.py | 25 +++++----- .../test_metropolitan_museum.py | 47 ++++++++----------- 2 files changed, 32 insertions(+), 40 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 4bbca6c8b..708fa4bb6 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -118,14 +118,16 @@ def _get_foreign_id(self, object_id: int, image_url: str): return f"{object_id}-{unique_identifier}" def _get_meta_data(self, object_json): - meta_data = None - if object_json.get("accessionNumber") is not None: - meta_data = { + if object_json is None: + return + if object_json.get("accessionNumber"): + return { "accession_number": object_json.get("accessionNumber"), } - return meta_data def _get_tag_list(self, object_json): + if object_json is None: + return tag_list = [ tag for tag in [ @@ -146,21 +148,18 @@ def _get_tag_list(self, object_json): return tag_list def _get_title(self, record): + if record is None: + return # Use if/else here to skip false-y (empty) titles: "" - if record.get("title"): + elif record.get("title"): return record.get("title") else: return record.get("objectName") def _get_artist_name(self, record): - artist = record.get("artistDisplayName") - # Treating "unidentified" the same as missing, but maybe it would be useful in - # in search? Maybe in the art world it has a more specific outsider art - # connotation? - if artist in {"Unidentified", "Unidentified artist"}: - return None - else: - return artist + if record is None: + return + return record.get("artistDisplayName") def get_media_type(self, record): # This provider only supports Images. diff --git a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index 895ccb8ff..68a16f557 100644 --- a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -61,33 +61,29 @@ def test_get_batch_data(response_json, expected): @pytest.mark.parametrize( - "case_name, response_json, expected", + "response_json, expected", [ - ( - "single image", - single_object_response, - single_expected_data[0].get("meta_data"), - ), - ("multi images", full_object_response, full_expected_data[0].get("meta_data")), + (single_object_response, single_expected_data[0].get("meta_data")), + (full_object_response, full_expected_data[0].get("meta_data")), + ({}, None), + (None, None), ], ) -def test_get_meta_data(case_name, response_json, expected): +def test_get_meta_data(response_json, expected): actual = mma._get_meta_data(response_json) assert expected == actual @pytest.mark.parametrize( - "case_name, response_json, expected", + "response_json, expected", [ - ( - "single image", - single_object_response, - single_expected_data[0].get("raw_tags"), - ), - ("multi images", full_object_response, full_expected_data[0].get("raw_tags")), + (single_object_response, single_expected_data[0].get("raw_tags")), + (full_object_response, full_expected_data[0].get("raw_tags")), + ({}, []), + (None, None), ], ) -def test_get_tag_list(case_name, response_json, expected): +def test_get_tag_list(response_json, expected): actual = mma._get_tag_list(response_json) assert expected == actual @@ -98,6 +94,8 @@ def test_get_tag_list(case_name, response_json, expected): ({"title": "Yes, regular case", "objectName": "Wrong"}, "Yes, regular case"), ({"objectName": "Yes, no title at all"}, "Yes, no title at all"), ({"title": "", "objectName": "Yes, empty title"}, "Yes, empty title"), + ({}, None), + (None, None), ], ) def test_get_title(response_json, expected): @@ -109,8 +107,7 @@ def test_get_title(response_json, expected): "response_json, expected", [ ({}, None), - ({"artistDisplayName": "Unidentified"}, None), - ({"artistDisplayName": "Unidentified artist"}, None), + (None, None), ({"artistDisplayName": "Unidentified flying obj"}, "Unidentified flying obj"), ], ) @@ -137,19 +134,15 @@ def test_get_record_data_with_non_ok(): @pytest.mark.parametrize( - "case_name, response_json, expected", + "response_json, expected", [ - ("single image", single_object_response, single_expected_data), - ("multi image", full_object_response, full_expected_data), - ( - "not cc0", - json.loads('{"isPublicDomain": false, "otherData": "is here too"}'), - None, - ), + (single_object_response, single_expected_data), + (full_object_response, full_expected_data), + (json.loads('{"isPublicDomain": false, "otherData": "is here too"}'), None), ], ) def test_get_record_data_returns_response_json_when_all_ok( - case_name, response_json, expected, monkeypatch + response_json, expected, monkeypatch ): monkeypatch.setattr( mma.delayed_requester, "get_response_json", lambda x, y: response_json From e441b7962f5802de6245700a03c8fc7168b7c189 Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 19:53:27 -0400 Subject: [PATCH 20/28] add test IDs for complex paramters --- .../test_metropolitan_museum.py | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index 68a16f557..a72971eef 100644 --- a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -53,6 +53,7 @@ def test_get_next_query_params(test_date, expected): [ ({"total": 4, "objectIDs": [153, 1578, 465, 546]}, [153, 1578, 465, 546]), (None, None), + ({}, None), ], ) def test_get_batch_data(response_json, expected): @@ -63,8 +64,16 @@ def test_get_batch_data(response_json, expected): @pytest.mark.parametrize( "response_json, expected", [ - (single_object_response, single_expected_data[0].get("meta_data")), - (full_object_response, full_expected_data[0].get("meta_data")), + pytest.param( + single_object_response, + single_expected_data[0].get("meta_data"), + id="single_image", + ), + pytest.param( + full_object_response, + full_expected_data[0].get("meta_data"), + id="full_object", + ), ({}, None), (None, None), ], @@ -77,8 +86,16 @@ def test_get_meta_data(response_json, expected): @pytest.mark.parametrize( "response_json, expected", [ - (single_object_response, single_expected_data[0].get("raw_tags")), - (full_object_response, full_expected_data[0].get("raw_tags")), + pytest.param( + single_object_response, + single_expected_data[0].get("raw_tags"), + id="single_image", + ), + pytest.param( + full_object_response, + full_expected_data[0].get("raw_tags"), + id="full_object", + ), ({}, []), (None, None), ], @@ -136,9 +153,13 @@ def test_get_record_data_with_non_ok(): @pytest.mark.parametrize( "response_json, expected", [ - (single_object_response, single_expected_data), - (full_object_response, full_expected_data), - (json.loads('{"isPublicDomain": false, "otherData": "is here too"}'), None), + pytest.param(single_object_response, single_expected_data, id="single_image"), + pytest.param(full_object_response, full_expected_data, id="full_object"), + pytest.param( + json.loads('{"isPublicDomain": false, "otherData": "is here too"}'), + None, + id="not_cc0", + ), ], ) def test_get_record_data_returns_response_json_when_all_ok( From e78b6d990f95aedfecd0c29d2a88a101f434f30f Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 19:59:48 -0400 Subject: [PATCH 21/28] use or to make json navigation more concise --- .../provider_api_scripts/metropolitan_museum.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 708fa4bb6..596fa4ebc 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -148,13 +148,9 @@ def _get_tag_list(self, object_json): return tag_list def _get_title(self, record): - if record is None: - return - # Use if/else here to skip false-y (empty) titles: "" - elif record.get("title"): - return record.get("title") - else: - return record.get("objectName") + if record is not None: + # Use or to skip false-y (empty) titles: "" + return record.get("title") or record.get("objectName") def _get_artist_name(self, record): if record is None: From 8bec026c591cb49afe0e2112d35553310db5bee6 Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 19 Aug 2022 20:39:52 -0400 Subject: [PATCH 22/28] add args for class param --- .../dags/providers/provider_api_scripts/metropolitan_museum.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 596fa4ebc..aface9bdc 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -44,7 +44,8 @@ class MetMuseumDataIngester(ProviderDataIngester): endpoint = "https://collectionapi.metmuseum.org/public/collection/v1/objects" DEFAULT_LICENSE_INFO = get_license_info(license_="cc0", license_version="1.0") - def __init__(self, date: str = None): + # adding args for automatically generated parameters from generate_tsv_filenames + def __init__(self, date: str = None, *args): super(MetMuseumDataIngester, self).__init__(date=date) self.retries = 5 From 248b1a19ee3bdb8076c51aa8316a78476be2ce7e Mon Sep 17 00:00:00 2001 From: rwidom Date: Mon, 22 Aug 2022 17:39:54 -0400 Subject: [PATCH 23/28] name config parameters explicitly --- .../providers/provider_api_scripts/metropolitan_museum.py | 6 +++--- .../provider_api_scripts/test_metropolitan_museum.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index aface9bdc..860b7c88a 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -45,12 +45,12 @@ class MetMuseumDataIngester(ProviderDataIngester): DEFAULT_LICENSE_INFO = get_license_info(license_="cc0", license_version="1.0") # adding args for automatically generated parameters from generate_tsv_filenames - def __init__(self, date: str = None, *args): - super(MetMuseumDataIngester, self).__init__(date=date) + def __init__(self, conf: dict = None, date: str = None): + super(MetMuseumDataIngester, self).__init__(conf=conf, date=date) self.retries = 5 self.query_param = None - if self.date: + if date: self.query_param = {"metadataDate": date} # this seems like useful information to track for context on the existing load diff --git a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index a72971eef..e884ed2da 100644 --- a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -43,7 +43,7 @@ [("2022-07-01", {"metadataDate": "2022-07-01"}), (None, None)], ) def test_get_next_query_params(test_date, expected): - ingester = MetMuseumDataIngester(test_date) + ingester = MetMuseumDataIngester(date=test_date) actual = ingester.get_next_query_params() assert actual == expected From 90d3727dad72268b7378367414fd9d9178ff8b22 Mon Sep 17 00:00:00 2001 From: rwidom Date: Mon, 22 Aug 2022 17:43:42 -0400 Subject: [PATCH 24/28] spell out metropolitan in default provider const. --- openverse_catalog/dags/common/loader/provider_details.py | 2 +- .../dags/providers/provider_api_scripts/metropolitan_museum.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/openverse_catalog/dags/common/loader/provider_details.py b/openverse_catalog/dags/common/loader/provider_details.py index f041eed59..e9e6eab49 100644 --- a/openverse_catalog/dags/common/loader/provider_details.py +++ b/openverse_catalog/dags/common/loader/provider_details.py @@ -20,7 +20,7 @@ SMITHSONIAN_DEFAULT_PROVIDER = "smithsonian" BROOKLYN_DEFAULT_PROVIDER = "brooklynmuseum" CLEVELAND_DEFAULT_PROVIDER = "clevelandmuseum" -MET_MUSEUM_DEFAULT_PROVIDER = "met" +METROPOLITAN_MUSEUM_DEFAULT_PROVIDER = "met" VICTORIA_DEFAULT_PROVIDER = "museumsvictoria" NYPL_DEFAULT_PROVIDER = "nypl" RAWPIXEL_DEFAULT_PROVIDER = "rawpixel" diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 860b7c88a..2aaec5ac7 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -40,7 +40,7 @@ class MetMuseumDataIngester(ProviderDataIngester): - providers = {"image": prov.MET_MUSEUM_DEFAULT_PROVIDER} + providers = {"image": prov.METROPOLITAN_MUSEUM_DEFAULT_PROVIDER} endpoint = "https://collectionapi.metmuseum.org/public/collection/v1/objects" DEFAULT_LICENSE_INFO = get_license_info(license_="cc0", license_version="1.0") From 1dcc93d63ec7ede06f97f5b6a84ef1e62964d2e2 Mon Sep 17 00:00:00 2001 From: rwidom Date: Wed, 24 Aug 2022 16:57:25 -0400 Subject: [PATCH 25/28] clarify objects vs records --- .../metropolitan_museum.py | 19 +++++++++------- .../test_metropolitan_museum.py | 22 ++++++++++++++----- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 2aaec5ac7..76d14ba3e 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -65,6 +65,9 @@ def get_next_query_params(self, prev_query_params=None): def get_batch_data(self, response_json): if response_json: self.object_ids_retrieved = response_json["total"] + # A single objet d'art may have more than one image (and therefore more + # than one record) associated with it, but there are generally on the order + # of 10 or fewer records per object. logger.info(f"Total objects found {self.object_ids_retrieved}") return response_json["objectIDs"] else: @@ -80,7 +83,7 @@ def get_record_data(self, object_id): if object_json.get("isPublicDomain") is False: self.non_cc0_objects += 1 if self.non_cc0_objects % self.batch_limit == 0: - logger.info(f"Retrieved {self.non_cc0_objects} non-CC0 records.") + logger.info(f"Retrieved {self.non_cc0_objects} non-CC0 objects.") return None main_image = object_json.get("primaryImage") @@ -148,17 +151,17 @@ def _get_tag_list(self, object_json): tag_list += [tag["term"] for tag in object_json.get("tags")] return tag_list - def _get_title(self, record): - if record is not None: + def _get_title(self, object_json): + if object_json is not None: # Use or to skip false-y (empty) titles: "" - return record.get("title") or record.get("objectName") + return object_json.get("title") or object_json.get("objectName") - def _get_artist_name(self, record): - if record is None: + def _get_artist_name(self, object_json): + if object_json is None: return - return record.get("artistDisplayName") + return object_json.get("artistDisplayName") - def get_media_type(self, record): + def get_media_type(self, object_json): # This provider only supports Images. return "image" diff --git a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index e884ed2da..38f1409cd 100644 --- a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -108,11 +108,23 @@ def test_get_tag_list(response_json, expected): @pytest.mark.parametrize( "response_json, expected", [ - ({"title": "Yes, regular case", "objectName": "Wrong"}, "Yes, regular case"), - ({"objectName": "Yes, no title at all"}, "Yes, no title at all"), - ({"title": "", "objectName": "Yes, empty title"}, "Yes, empty title"), - ({}, None), - (None, None), + pytest.param( + {"title": "Yes, regular case", "objectName": "Wrong"}, + "Yes, regular case", + id="happy_path", + ), + pytest.param( + {"objectName": "Yes, no title at all"}, + "Yes, no title at all", + id="no_title", + ), + pytest.param( + {"title": "", "objectName": "Yes, empty title"}, + "Yes, empty title", + id="empty_string_title", + ), + pytest.param({}, None, id="empty_json"), + pytest.param(None, None, id="None"), ], ) def test_get_title(response_json, expected): From a1670e526e422fc48e4e83ef246fca3d45ebc768 Mon Sep 17 00:00:00 2001 From: rwidom Date: Wed, 24 Aug 2022 18:15:33 -0400 Subject: [PATCH 26/28] default to empty dict for query params if no date --- .../providers/provider_api_scripts/metropolitan_museum.py | 4 +++- .../provider_api_scripts/test_metropolitan_museum.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 76d14ba3e..73696b669 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -49,7 +49,9 @@ def __init__(self, conf: dict = None, date: str = None): super(MetMuseumDataIngester, self).__init__(conf=conf, date=date) self.retries = 5 - self.query_param = None + # Default to empty dict to avoid break in ingest_records. In general, this dag + # should not be run without a date, but don't want to completely rule it out. + self.query_param = {} if date: self.query_param = {"metadataDate": date} diff --git a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index 38f1409cd..1a798fd8d 100644 --- a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -40,7 +40,7 @@ @pytest.mark.parametrize( "test_date, expected", - [("2022-07-01", {"metadataDate": "2022-07-01"}), (None, None)], + [("2022-07-01", {"metadataDate": "2022-07-01"}), (None, {}), ("", {})], ) def test_get_next_query_params(test_date, expected): ingester = MetMuseumDataIngester(date=test_date) From 3f7765ebc23bae24499b4ae6caecea2e41b9a181 Mon Sep 17 00:00:00 2001 From: rwidom Date: Thu, 25 Aug 2022 08:12:06 -0400 Subject: [PATCH 27/28] only one batch per dag run --- .../providers/provider_api_scripts/metropolitan_museum.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py index 73696b669..40d83288b 100644 --- a/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py +++ b/openverse_catalog/dags/providers/provider_api_scripts/metropolitan_museum.py @@ -119,6 +119,12 @@ def get_record_data(self, object_id): for img in image_list ] + def get_should_continue(self, response_json): + # The met museum search function does not have pagination for the initial list + # of object IDs, so after processing the first list, we're done. + if response_json: + return False + def _get_foreign_id(self, object_id: int, image_url: str): unique_identifier = image_url.split("/")[-1].split(".")[0] return f"{object_id}-{unique_identifier}" From 17d0aff9609336e9a8480c1a350e104fc2fe56ee Mon Sep 17 00:00:00 2001 From: rwidom Date: Fri, 26 Aug 2022 06:21:44 -0400 Subject: [PATCH 28/28] additional pytest parameter set names --- .../test_metropolitan_museum.py | 34 +++++++++++++------ 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py index 1a798fd8d..b0d2c503e 100644 --- a/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py +++ b/tests/dags/providers/provider_api_scripts/test_metropolitan_museum.py @@ -40,7 +40,11 @@ @pytest.mark.parametrize( "test_date, expected", - [("2022-07-01", {"metadataDate": "2022-07-01"}), (None, {}), ("", {})], + [ + pytest.param("2022-07-01", {"metadataDate": "2022-07-01"}, id="happy_path"), + pytest.param(None, {}, id="None"), + pytest.param("", {}, id="empty_string"), + ], ) def test_get_next_query_params(test_date, expected): ingester = MetMuseumDataIngester(date=test_date) @@ -51,9 +55,13 @@ def test_get_next_query_params(test_date, expected): @pytest.mark.parametrize( "response_json, expected", [ - ({"total": 4, "objectIDs": [153, 1578, 465, 546]}, [153, 1578, 465, 546]), - (None, None), - ({}, None), + pytest.param( + {"total": 4, "objectIDs": [153, 1578, 465, 546]}, + [153, 1578, 465, 546], + id="happy_path", + ), + pytest.param({}, None, id="empty_dict"), + pytest.param(None, None, id="None"), ], ) def test_get_batch_data(response_json, expected): @@ -74,8 +82,8 @@ def test_get_batch_data(response_json, expected): full_expected_data[0].get("meta_data"), id="full_object", ), - ({}, None), - (None, None), + pytest.param({}, None, id="empty_dict"), + pytest.param(None, None, id="None"), ], ) def test_get_meta_data(response_json, expected): @@ -96,8 +104,8 @@ def test_get_meta_data(response_json, expected): full_expected_data[0].get("raw_tags"), id="full_object", ), - ({}, []), - (None, None), + pytest.param({}, [], id="empty_dict"), + pytest.param(None, None, id="None"), ], ) def test_get_tag_list(response_json, expected): @@ -135,9 +143,13 @@ def test_get_title(response_json, expected): @pytest.mark.parametrize( "response_json, expected", [ - ({}, None), - (None, None), - ({"artistDisplayName": "Unidentified flying obj"}, "Unidentified flying obj"), + pytest.param({}, None, id="empty_json"), + pytest.param(None, None, id="None"), + pytest.param( + {"artistDisplayName": "Unidentified flying obj"}, + "Unidentified flying obj", + id="happy_path", + ), ], ) def test_get_artist_name(response_json, expected):