Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Collect image dimensions from Europeana #2782

Merged
merged 29 commits into from
Sep 9, 2023
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
8420d7f
adding test resources
rwidom Aug 5, 2023
8c9a9db
add individual item data pull to get dimensions
rwidom Aug 5, 2023
1013065
make sure the dimension tests actually run
rwidom Aug 7, 2023
2bba581
mock record api calls to enable test image list
rwidom Aug 7, 2023
b1b4696
separate function to get item data
rwidom Aug 9, 2023
ec40e8c
dimensions after 1st resource, use default delay
rwidom Aug 10, 2023
9d983bc
add filetype and filesize
rwidom Aug 11, 2023
0c6c083
handle more falsey item responses
rwidom Aug 15, 2023
0a79436
Merge branch 'main' into fix-1484/europeana-image-dimensions
rwidom Aug 15, 2023
99293b0
handle shorter filetype strings
rwidom Aug 15, 2023
4d82980
Merge branch 'main' into fix-1484/europeana-image-dimensions
rwidom Aug 17, 2023
65db05a
Merge branch 'main' into fix-1484/europeana-image-dimensions
rwidom Aug 18, 2023
5776f1d
Merge branch 'main' into fix-1484/europeana-image-dimensions
rwidom Aug 19, 2023
43c98ec
Merge branch 'main' into fix-1484/europeana-image-dimensions
rwidom Aug 22, 2023
9b75639
reduce nesting
rwidom Aug 22, 2023
2a20b71
extract expected result constants
rwidom Aug 22, 2023
daa0c48
add tests for one dimension but not both
rwidom Aug 22, 2023
d356b23
Merge branch 'main' into fix-1484/europeana-image-dimensions
rwidom Aug 22, 2023
482995d
Merge branch 'main' into fix-1484/europeana-image-dimensions
AetherUnbound Aug 29, 2023
19054ab
check image url and reduce nesting
rwidom Aug 31, 2023
bc767df
remove extra test resource
rwidom Aug 31, 2023
a2f1c57
single record builder dict param
rwidom Aug 31, 2023
1e75440
Merge branch 'main' into fix-1484/europeana-image-dimensions
rwidom Aug 31, 2023
d1b0efc
add combined param to main get_record_data test
rwidom Aug 31, 2023
19cf649
make timeout 3 days & remove reingestion workflow
rwidom Aug 31, 2023
5924f05
Merge branch 'main' into fix-1484/europeana-image-dimensions
rwidom Sep 1, 2023
b7689d1
generate-dag-docs and clean up comments
rwidom Sep 6, 2023
965f1d3
Merge branch 'main' into fix-1484/europeana-image-dimensions
rwidom Sep 6, 2023
ef00712
"item webresource" to "item_webresource"
rwidom Sep 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 85 additions & 10 deletions catalog/dags/providers/provider_api_scripts/europeana.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,17 @@ class EuropeanaRecordBuilder:

def get_record_data(self, data: dict) -> dict | None:
try:
item_data = data.get("item webresource", {})
record = {
"foreign_landing_url": self._get_foreign_landing_url(data),
"url": self._get_image_url(data),
"foreign_identifier": self._get_foreign_identifier(data),
"meta_data": self._get_meta_data_dict(data),
"title": self._get_title(data),
"license_info": self._get_license_info(data),
}
"filetype": self._get_filetype(item_data),
"filesize": self._get_filesize(item_data),
} | self._get_image_dimensions(item_data)

data_providers = set(record["meta_data"]["dataProvider"])
eligible_sub_providers = {
Expand All @@ -81,13 +84,12 @@ def get_record_data(self, data: dict) -> dict | None:
f"image with foreign ID {record['foreign_identifier']}"
)

return record | {
"source": (
eligible_sub_providers.pop()
if len(eligible_sub_providers) == 1
else EuropeanaDataIngester.providers["image"]
)
}
record["source"] = (
eligible_sub_providers.pop()
if len(eligible_sub_providers) == 1
else EuropeanaDataIngester.providers["image"]
)
return {k: v for k, v in record.items() if v is not None}
except EmptyRequiredFieldException as exc:
logger.warning("A required field was empty", exc_info=exc)
return None
Expand Down Expand Up @@ -129,6 +131,22 @@ def _get_foreign_landing_url(self, data: dict) -> str:

return europeana_url

def _get_image_dimensions(self, item_data: dict) -> dict:
width = item_data.get("ebucoreWidth")
height = item_data.get("ebucoreHeight")
if width and height:
return {"width": width, "height": height}
return {}

def _get_filetype(self, item_data: dict) -> str:
if filetype := item_data.get("ebucoreHasMimeType"):
if "/" in filetype:
return item_data.get("ebucoreHasMimeType").split("/")[1]
return filetype

def _get_filesize(self, item_data: dict) -> int:
return item_data.get("ebucoreFileByteSize")

def _get_meta_data_dict(self, data: dict) -> dict:
meta_data = {
"country": data.get("country"),
Expand Down Expand Up @@ -159,7 +177,7 @@ class EuropeanaDataIngester(ProviderDataIngester):
providers = {"image": prov.EUROPEANA_DEFAULT_PROVIDER}
sub_providers = prov.EUROPEANA_SUB_PROVIDERS
endpoint = "https://api.europeana.eu/record/v2/search.json?"
delay = 30
delay = 3

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand Down Expand Up @@ -188,6 +206,9 @@ def __init__(self, *args, **kwargs):
"cursor": "*",
}

self.item_params = {
"wskey": Variable.get("API_KEY_EUROPEANA", default_var=None)
}
self.record_builder = EuropeanaRecordBuilder()

def _get_timestamp_query_param(self, date):
Expand Down Expand Up @@ -223,7 +244,61 @@ def get_batch_data(self, response_json: dict) -> None | list[dict]:
return response_json.get("items")

def get_record_data(self, data: dict) -> dict:
return self.record_builder.get_record_data(data)
return self.record_builder.get_record_data(
data | {"item webresource": self._get_additional_item_data(data)}
stacimc marked this conversation as resolved.
Show resolved Hide resolved
)

def _get_id_and_url(self, data) -> tuple:
try:
return (
self.record_builder._get_foreign_identifier(data),
self.record_builder._get_image_url(data),
)
except EmptyRequiredFieldException as exc:
logger.warning("Missing id or url", exc_info=exc)
return (None, None)

def _get_additional_item_data(self, data) -> dict:
# Delay of 30 seconds is fine for 100 items at a time, but not for each item.
# Occasionally getting this error on the item request when the delay is 3 secs:
# ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed
# connection without response'))
# But it always works on the next try so maybe that's ok?
# Options:
# - Switch to 3 second delay across the board and leave everything else alone,
# or maybe add a max number of this kind of failure per batch or per dag run.
# - Separate delayed requesters 30 seconds for batch, 1 second for items, same
# default batch size = 100.
# - Smaller batches (35), but default delay = 1 sec, like Brooklyn Museum.
# - Just go with defaults across the board, on the assumption that the batch
# query is the expensive one, and there will be at least 30 seconds between,
# given the 100 individual look-ups.
# Seems like might be best to reach out to Europeana and get their preference
# for the best way for us to proceed.
stacimc marked this conversation as resolved.
Show resolved Hide resolved
(item_id, url) = self._get_id_and_url(data)
if not (item_id and url):
return {}
item_response = self.get_response_json(
query_params=self.item_params,
endpoint=f"https://api.europeana.eu/record/v2{item_id}.json",
)
if not item_response or not item_response.get("success"):
logger.warning("Item request failed no response or ``success != True``")
return {}
aggregations = item_response.get("object", {}).get("aggregations", [])
for aggregation in aggregations:
return next(
(
resource
for resource in aggregation.get("webResources", [])
if (
resource.get("ebucoreHasMimeType", "").startswith("image")
and resource.get("about") == url
)
),
{},
)
return {}


def main(date):
Expand Down
10 changes: 0 additions & 10 deletions catalog/dags/providers/provider_reingestion_workflows.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from dataclasses import dataclass
from datetime import timedelta

from providers.provider_api_scripts.europeana import EuropeanaDataIngester
from providers.provider_api_scripts.flickr import FlickrDataIngester
from providers.provider_api_scripts.metropolitan_museum import MetMuseumDataIngester
from providers.provider_api_scripts.phylopic import PhylopicDataIngester
Expand Down Expand Up @@ -56,15 +55,6 @@ def __post_init__(self):


PROVIDER_REINGESTION_WORKFLOWS = [
ProviderReingestionWorkflow(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🥳

# 60 total reingestion days
ingester_class=EuropeanaDataIngester,
max_active_tasks=3,
pull_timeout=timedelta(hours=16),
daily_list_length=7,
one_month_list_length=12,
three_month_list_length=40,
),
ProviderReingestionWorkflow(
# 128 total reingestion days
ingester_class=FlickrDataIngester,
Expand Down
1 change: 1 addition & 0 deletions catalog/dags/providers/provider_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ def __post_init__(self):
start_date=datetime(2022, 10, 27),
schedule_string="@daily",
dated=True,
pull_timeout=timedelta(days=3),
),
ProviderWorkflow(
ingester_class=FinnishMuseumsDataIngester,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
{
"apikey": "test",
"success": true,
"statsDuration": 170,
"requestNumber": 999,
"object": {
"about": "/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6",
"aggregations": [
{
"about": "/aggregation/provider/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6",
"edmDataProvider": {
"def": ["http://data.europeana.eu/organization/1482250000004511854"]
},
"edmIsShownBy": "https://images.memorix.nl/nda/thumb/fullsize/010a31d7-9316-54d7-6ad0-83b9914688d9.jpg",
"edmIsShownAt": "http://nimh-beeldbank.defensie.nl/memorix/0000ee69-1b9e-6823-478f-c88af61736c6",
"edmObject": "https://images.memorix.nl/nda/thumb/640x480/010a31d7-9316-54d7-6ad0-83b9914688d9.jpg",
"edmProvider": {
"def": ["http://data.europeana.eu/organization/1482250000004671093"]
},
"edmRights": {
"def": ["http://creativecommons.org/publicdomain/zero/1.0/"]
},
"edmUgc": "false",
"aggregatedCHO": "/item/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6",
"webResources": [
{
"about": "happy_url",
"textAttributionSnippet": " - https://www.europeana.eu/item/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6. Fotovlucht Vliegbasis Soesterberg. Nederlands Instituut voor Militaire Historie - http://nimh-beeldbank.defensie.nl/memorix/0000ee69-1b9e-6823-478f-c88af61736c6. CC0 - http://creativecommons.org/publicdomain/zero/1.0/",
"htmlAttributionSnippet": "<link rel='stylesheet' type='text/css' href='https://api.europeana.eu/attribution/style.css'/><dl class='europeana-attribution' lang='en'><dt>Creator</dt><dd lang=''>Fotovlucht Vliegbasis Soesterberg</dd><dt>Institution</dt><dd lang=''><a href='http://nimh-beeldbank.defensie.nl/memorix/0000ee69-1b9e-6823-478f-c88af61736c6' target='_blank' rel='noopener'>Nederlands Instituut voor Militaire Historie</a></dd><dt>Country</dt><dd lang=''>Netherlands</dd><dt>Rights</dt><dd><a href='http://creativecommons.org/publicdomain/zero/1.0/' target='_blank' rel='noopener'><span class='icon-cc'/><span class='icon-zero'/>CC0</a></dd></dl>",
"ebucoreHasMimeType": "image/jpeg",
"ebucoreFileByteSize": 36272,
"ebucoreWidth": 381,
"ebucoreHeight": 480,
"edmHasColorSpace": "sRGB",
"edmComponentColor": [
"#FF6347",
"#556B2F",
"#E9967A",
"#A0522D",
"#FFEBCD",
"#DAA520"
],
"ebucoreOrientation": "portrait"
},
{
"about": "http://nimh-beeldbank.defensie.nl/memorix/0000ee69-1b9e-6823-478f-c88af61736c6",
"textAttributionSnippet": " - https://www.europeana.eu/item/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6. Fotovlucht Vliegbasis Soesterberg. Nederlands Instituut voor Militaire Historie - http://nimh-beeldbank.defensie.nl/memorix/0000ee69-1b9e-6823-478f-c88af61736c6. CC0 - http://creativecommons.org/publicdomain/zero/1.0/",
"htmlAttributionSnippet": "<link rel='stylesheet' type='text/css' href='https://api.europeana.eu/attribution/style.css'/><dl class='europeana-attribution' lang='en'><dt>Creator</dt><dd lang=''>Fotovlucht Vliegbasis Soesterberg</dd><dt>Institution</dt><dd lang=''><a href='http://nimh-beeldbank.defensie.nl/memorix/0000ee69-1b9e-6823-478f-c88af61736c6' target='_blank' rel='noopener'>Nederlands Instituut voor Militaire Historie</a></dd><dt>Country</dt><dd lang=''>Netherlands</dd><dt>Rights</dt><dd><a href='http://creativecommons.org/publicdomain/zero/1.0/' target='_blank' rel='noopener'><span class='icon-cc'/><span class='icon-zero'/>CC0</a></dd></dl>",
"ebucoreHasMimeType": "text/html",
"ebucoreFileByteSize": 0
},
{
"about": "https://images.memorix.nl/nda/thumb/fullsize/010a31d7-9316-54d7-6ad0-83b9914688d9.jpg",
"textAttributionSnippet": " - https://www.europeana.eu/item/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6. Fotovlucht Vliegbasis Soesterberg. Nederlands Instituut voor Militaire Historie - http://nimh-beeldbank.defensie.nl/memorix/0000ee69-1b9e-6823-478f-c88af61736c6. CC0 - http://creativecommons.org/publicdomain/zero/1.0/",
"htmlAttributionSnippet": "<link rel='stylesheet' type='text/css' href='https://api.europeana.eu/attribution/style.css'/><dl class='europeana-attribution' lang='en'><dt>Creator</dt><dd lang=''>Fotovlucht Vliegbasis Soesterberg</dd><dt>Institution</dt><dd lang=''><a href='http://nimh-beeldbank.defensie.nl/memorix/0000ee69-1b9e-6823-478f-c88af61736c6' target='_blank' rel='noopener'>Nederlands Instituut voor Militaire Historie</a></dd><dt>Country</dt><dd lang=''>Netherlands</dd><dt>Rights</dt><dd><a href='http://creativecommons.org/publicdomain/zero/1.0/' target='_blank' rel='noopener'><span class='icon-cc'/><span class='icon-zero'/>CC0</a></dd></dl>",
"dctermsIsReferencedBy": [
"https://iiif.europeana.eu/presentation/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6/manifest"
],
"ebucoreHasMimeType": "image/jpeg",
"ebucoreFileByteSize": 157858,
"ebucoreWidth": 795,
"ebucoreHeight": 1000,
"edmHasColorSpace": "sRGB",
"edmComponentColor": [
"#FFEBCD",
"#8B0000",
"#FFFAFA",
"#FFEFD5",
"#FFD700",
"#E9967A"
],
"ebucoreOrientation": "portrait"
},
{
"about": "https://iiif.europeana.eu/presentation/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6/manifest",
"rdfType": "http://iiif.io/api/presentation/3#Manifest"
}
]
}
],
"edmDatasetName": ["2021650_Ag_NL_DigitaleCollectie_nimh"],
"europeanaAggregation": {
"about": "/aggregation/europeana/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6",
"aggregatedCHO": "/item/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6",
"edmCountry": {
"def": ["Netherlands"]
},
"edmLanguage": {
"def": ["nl"]
},
"edmPreview": "https://api.europeana.eu/thumbnail/v2/url.json?uri=https%3A%2F%2Fimages.memorix.nl%2Fnda%2Fthumb%2F640x480%2F010a31d7-9316-54d7-6ad0-83b9914688d9.jpg&type=IMAGE",
"edmLandingPage": "https://www.europeana.eu/item/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6",
"dqvHasQualityAnnotation": [
"/item/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6#contentTier",
"/item/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6#metadataTier"
]
},
"europeanaCollectionName": ["2021650_Ag_NL_DigitaleCollectie_nimh"],
"europeanaCompleteness": 10,
"organizations": [
{
"about": "http://data.europeana.eu/organization/1482250000004671093",
"prefLabel": {
"en": ["Dutch Collections for Europe"]
}
},
{
"about": "http://data.europeana.eu/organization/1482250000004511854",
"prefLabel": {
"en": ["Netherlands Institute for Military History"],
"nl": ["Nederlands Instituut voor Militaire Historie"]
}
}
],
"providedCHOs": [
{
"about": "/item/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6"
}
],
"proxies": [
{
"about": "/proxy/europeana/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6",
"dcIdentifier": {
"def": [
"http://nimh-beeldbank.defensie.nl/memorix/0000ee69-1b9e-6823-478f-c88af61736c6"
]
},
"proxyIn": [
"/aggregation/europeana/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6"
],
"proxyFor": "/item/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6",
"lineage": [
"/proxy/provider/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6"
],
"europeanaProxy": true
},
{
"about": "/proxy/provider/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6",
"dcCreator": {
"def": ["Fotovlucht Vliegbasis Soesterberg"]
},
"dcDescription": {
"def": [
"Een Agusta Bell AB-412SP van 303 Squadron boven het betonningsvaartuig Terschelling (1988-) van Rijkswaterstaat, ingedeeld bij de Kustwacht, tijdens de\n reddingsoefening 'Get Wet'.&nbsp;",
"Een Agusta Bell AB-412SP van het 303 squadron van de Koninklijke Luchtmacht hangt boven het schip de 'Terschelling' van de kustwacht, tijdens de oefening 'Get wet' boven de\n Noordzee."
]
},
"dcIdentifier": {
"def": ["2156_018660"]
},
"dcSubject": {
"def": [
"R-02",
"Koninklijke Luchtmacht, squadrons Koninklijke Luchtmacht, 303 Squadron (KLu)",
"kustwacht"
]
},
"dcType": {
"def": ["Digitaal fotobestand"]
},
"dctermsCreated": {
"def": ["2003-09-03 /"]
},
"dctermsIsPartOf": {
"def": ["Luchtfotoarchief Vliegbasis Soesterberg"]
},
"dctermsSpatial": {
"def": ["Noordzee, Nederland"]
},
"proxyIn": [
"/aggregation/provider/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6"
],
"proxyFor": "/item/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6",
"edmType": "IMAGE",
"europeanaProxy": false
}
],
"qualityAnnotations": [
{
"about": "/item/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6#contentTier",
"created": "2023-08-01T14:07:27.421851Z",
"target": [
"/aggregation/provider/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6"
],
"body": "http://www.europeana.eu/schemas/epf/contentTier2"
},
{
"about": "/item/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6#metadataTier",
"created": "2023-08-01T14:07:27.421873Z",
"target": [
"/aggregation/provider/2021650/memorix_0000ee69_1b9e_6823_478f_c88af61736c6"
],
"body": "http://www.europeana.eu/schemas/epf/metadataTier0"
}
],
"timestamp_created": "2022-03-02T15:35:41.563Z",
"timestamp_created_epoch": 1646235341563,
"timestamp_update": "2023-08-01T14:02:32.057Z",
"timestamp_update_epoch": 1690898552057,
"type": "IMAGE"
}
}
Loading