Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace media_url with url in provider scripts #1891

Merged
merged 3 commits into from
May 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions catalog/dags/common/storage/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(
def add_item(
self,
foreign_landing_url: str,
audio_url: str,
url: str,
license_info: LicenseInfo,
foreign_identifier: str,
thumbnail_url: str | None = None,
Expand Down Expand Up @@ -79,7 +79,7 @@ def add_item(

foreign_landing_url: URL of page where the audio lives on the
source website.
audio_url: Direct link to the audio file
url: Direct link to the audio file
license_info: LicenseInfo object that has
- the URL of the license for the audio,
- string representation of the license,
Expand Down Expand Up @@ -154,7 +154,7 @@ def add_item(

audio_data = {
"foreign_landing_url": foreign_landing_url,
"audio_url": audio_url,
"url": url,
"license_info": license_info,
"thumbnail_url": thumbnail_url,
"filesize": filesize,
Expand Down Expand Up @@ -188,9 +188,6 @@ def _get_audio(self, **kwargs) -> Audio | None:
audio_metadata = self.clean_media_metadata(**kwargs)
if audio_metadata is None:
return None
# Convert the `audio_url` key used in AudioStore, TSV and
# provider API scripts into `url` key used in db
audio_metadata["url"] = audio_metadata.pop("audio_url")
# Validate that duration does not exceed Postgres int maximum
audio_metadata["duration"] = self._validate_integer(
audio_metadata.get("duration")
Expand Down
9 changes: 3 additions & 6 deletions catalog/dags/common/storage/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(
def add_item(
self,
foreign_landing_url: str,
image_url: str,
url: str,
license_info: LicenseInfo,
foreign_identifier: str,
thumbnail_url: str | None = None,
Expand All @@ -70,7 +70,7 @@ def add_item(
Required Arguments:
foreign_landing_url: URL of page where the image lives on the
source website.
image_url: Direct link to the image file
url: Direct link to the image file

license_info: LicenseInfo object that has
- the URL of the license for the image,
Expand Down Expand Up @@ -125,7 +125,7 @@ def add_item(

image_data = {
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": url,
"thumbnail_url": thumbnail_url,
"filesize": filesize,
"filetype": filetype,
Expand Down Expand Up @@ -153,9 +153,6 @@ def _get_image(self, **kwargs) -> Image | None:
image_metadata = self.clean_media_metadata(**kwargs)
if image_metadata is None:
return None
# Convert the `image_url` key used in ImageStore, TSV and
# provider API scripts into `url` key used in db
image_metadata["url"] = image_metadata.pop("image_url")
return Image(**image_metadata)


Expand Down
6 changes: 3 additions & 3 deletions catalog/dags/common/storage/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,13 @@ def clean_media_metadata(self, **media_data) -> dict | None:
"license_info",
"foreign_identifier",
"foreign_landing_url",
f"{self.media_type}_url",
"url",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whew, this is a nice change to see!

]:
if media_data.get(field) is None:
raise ValueError(f"Record missing required field: `{field}`")

for field in [
f"{self.media_type}_url",
"url",
"foreign_landing_url",
"thumbnail_url",
"creator_url",
Expand All @@ -143,7 +143,7 @@ def clean_media_metadata(self, **media_data) -> dict | None:
media_data["ingestion_type"] = "provider_api"

media_data["filetype"] = self._validate_filetype(
media_data["filetype"], media_data[f"{self.media_type}_url"]
media_data["filetype"], media_data["url"]
)
media_data["filesize"] = self._validate_integer(media_data.get("filesize"))

Expand Down
2 changes: 1 addition & 1 deletion catalog/dags/common/tsv_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def _process_row(tsv_row):
image_store = _image_store_dict[row_image.provider]
image_store.add_item(
foreign_landing_url=row_image.foreign_landing_url,
image_url=row_image.url,
url=row_image.url,
thumbnail_url=row_image.thumbnail_url,
license_info=get_license_info(
license_url=get_license_url(row_meta_data),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def _handle_object_data(data, license_info: LicenseInfo) -> list[dict]:
images.append(
{
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": image_url,
"license_info": license_info,
"foreign_identifier": foreign_identifier,
"width": width,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def get_record_data(self, data):
"foreign_landing_url": foreign_landing_url,
"title": data.get("title", None),
"creator": creator_name,
"image_url": image["url"],
"url": image["url"],
"width": self._get_int_value(image, "width"),
"height": self._get_int_value(image, "height"),
"filesize": self._get_int_value(image, "filesize"),
Expand Down
2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/europeana.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def get_record_data(self, data: dict) -> dict | None:
try:
record = {
"foreign_landing_url": self._get_foreign_landing_url(data),
"image_url": self._get_image_url(data),
"url": self._get_image_url(data),
"foreign_identifier": self._get_foreign_identifier(data),
"meta_data": self._get_meta_data_dict(data),
"title": self._get_title(data),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def get_record_data(self, data):
"license_info": license_info,
"foreign_identifier": foreign_identifier,
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": image_url,
"title": title,
"source": source,
"creator": creator,
Expand Down
4 changes: 2 additions & 2 deletions catalog/dags/providers/provider_api_scripts/flickr.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def get_record_data(self, data):
return None

image_size = self._get_largest_image_size(data)
if not (image_url := data.get(f"url_{image_size}")):
if not (url := data.get(f"url_{image_size}")):
return None

if not (foreign_identifier := data.get("id")):
Expand Down Expand Up @@ -254,7 +254,7 @@ def get_record_data(self, data):

return {
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": url,
"license_info": license_info,
"foreign_identifier": foreign_identifier,
"width": width,
Expand Down
6 changes: 3 additions & 3 deletions catalog/dags/providers/provider_api_scripts/freesound.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def _get_audio_files(
return None, None

main_file = {
"audio_url": preview_url,
"url": preview_url,
"filetype": self.preferred_preview.split("-")[-1],
"bit_rate": FreesoundDataIngester.preview_bitrates[self.preferred_preview],
"filesize": int(filesize),
Expand Down Expand Up @@ -227,7 +227,7 @@ def get_record_data(self, media_data: dict) -> dict | list[dict] | None:
if not (item_license := get_license_info(media_data.get("license"))):
return None

# We use the mp3-hq preview url as `audio_url` as the main url
# We use the mp3-hq preview url as `url` as the main url
# for playing on the frontend,
# and the actual uploaded file as an alt_file that is available
# for download (and requires a user to be authenticated to download)
Expand Down Expand Up @@ -258,7 +258,7 @@ def get_record_data(self, media_data: dict) -> dict | list[dict] | None:
"audio_set": audio_set,
"set_url": set_url,
"alt_files": alt_files,
# audio_url, filetype, bit_rate
# url, filetype, bit_rate
**main_audio,
}

Expand Down
10 changes: 5 additions & 5 deletions catalog/dags/providers/provider_api_scripts/jamendo.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,12 @@ def _get_audio_url(self, data):
>>> _remove_param_from_url(url, "from")
'https://prod-1.storage.jamendo.com/?trackid=1532771&format=mp31'
:return: Tuple with main audio file information:
- audio_url
- url
- duration (in milliseconds)
"""
if not (audio_url := data.get("audio")):
if not (url := data.get("audio")):
return None
return self._remove_param_from_url(audio_url, "from")
return self._remove_param_from_url(url, "from")

@staticmethod
def _get_creator_data(data):
Expand Down Expand Up @@ -184,7 +184,7 @@ def get_record_data(self, data):
if not (foreign_landing_url := data.get("shareurl")):
return None

if not (audio_url := self._get_audio_url(data)):
if not (url := self._get_audio_url(data)):
return None

if not (license_info := get_license_info(data.get("license_ccurl"))):
Expand Down Expand Up @@ -221,7 +221,7 @@ def get_record_data(self, data):
"creator_url": creator_url,
"foreign_identifier": foreign_identifier,
"foreign_landing_url": foreign_landing_url,
"audio_url": audio_url,
"url": url,
"duration": duration,
"filetype": filetype,
"thumbnail_url": thumbnail,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def get_record_data(self, object_id):
return [
{
"foreign_landing_url": foreign_landing_url,
"image_url": img,
"url": img,
"license_info": self.DEFAULT_LICENSE_INFO,
"foreign_identifier": self._get_foreign_id(object_id, img),
"creator": artist,
Expand Down
10 changes: 4 additions & 6 deletions catalog/dags/providers/provider_api_scripts/museum_victoria.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

class ImageDetails(TypedDict, total=False):
foreign_identifier: str
image_url: str
url: str
license_info: LicenseInfo
foreign_landing_url: str
title: str
Expand Down Expand Up @@ -97,17 +97,15 @@ def _get_images(media_data) -> list[ImageDetails]:
continue
if not (foreign_identifier := media.get("id")):
continue
image_url, height, width, filesize = VictoriaDataIngester._get_image_data(
media
)
url, height, width, filesize = VictoriaDataIngester._get_image_data(media)
license_info = VictoriaDataIngester._get_license_info(media)
if not image_url or not license_info:
if not url or not license_info:
continue
creator = VictoriaDataIngester._get_creator(media)

image: ImageDetails = {
"foreign_identifier": foreign_identifier,
"image_url": image_url,
"url": url,
"height": height,
"width": width,
"license_info": license_info,
Expand Down
4 changes: 2 additions & 2 deletions catalog/dags/providers/provider_api_scripts/nappy.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def get_record_data(self, data: dict) -> dict | None:
if not (foreign_landing_url := data.get("foreign_landing_url")):
return None

if not (image_url := data.get("url")):
if not (url := data.get("url")):
return None

if not (foreign_identifier := data.get("foreign_identifier")):
Expand All @@ -95,7 +95,7 @@ def get_record_data(self, data: dict) -> dict | None:

return {
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": url,
"thumbnail_url": thumbnail_url,
"license_info": self.license_info,
"foreign_identifier": foreign_identifier,
Expand Down
22 changes: 10 additions & 12 deletions catalog/dags/providers/provider_api_scripts/nypl.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class NyplDataIngester(ProviderDataIngester):
# NYPL returns a list of image objects, with the dimension encoded
# in the URL's query parameter.
# This list is in order from the largest image to the smallest one.
image_url_dimensions = ["g", "v", "q", "w", "r"]
url_dimensions = ["g", "v", "q", "w", "r"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe this would be better as image_dimensions?


def __init__(self, *args, **kwargs):
NYPL_API = Variable.get("API_KEY_NYPL")
Expand Down Expand Up @@ -117,23 +117,21 @@ def get_record_data(self, data):
continue

image_link = capture.get("imageLinks", {}).get("imageLink", [])
image_url, filetype = NyplDataIngester._get_image_data(image_link)
if not image_url:
url, filetype = NyplDataIngester._get_image_data(image_link)
if not url:
continue

foreign_landing_url = capture.get("itemLink", {}).get("$")
if not foreign_landing_url:
continue
if not (foreign_landing_url := capture.get("itemLink", {}).get("$")):
continue

license_url = capture.get("rightsStatementURI", {}).get("$")
if not (license_info := get_license_info(license_url)):
continue

image_data = {
"foreign_identifier": foreign_identifier,
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": url,
"license_info": license_info,
"title": title,
"creator": creator,
Expand Down Expand Up @@ -186,9 +184,9 @@ def _get_image_data(images) -> tuple[None, None] | tuple[str, str]:
"description": "Cropped .jpeg (1600 pixels on the long side)"
}
Selects the largest image based on the image URL's `t` query parameter
and image_url_dimensions.
and url_dimensions.
"""
# Create a dict with the NyplDataIngester.image_url_dimensions as keys,
# Create a dict with the NyplDataIngester.url_dimensions as keys,
# and image data as value.
image_types = {
parse_qs(urlparse(img["$"]).query)["t"][0]: i
Expand All @@ -200,17 +198,17 @@ def _get_image_data(images) -> tuple[None, None] | tuple[str, str]:
# Select the dict containing the URL for the largest image.
# The image size is encoded in the URL query parameter `t`.
# The list of dimensions is sorted by size of the corresponding image.
for dimension in NyplDataIngester.image_url_dimensions:
for dimension in NyplDataIngester.url_dimensions:
preferred_image_index = image_types.get(dimension)
if preferred_image_index is not None:
preferred_image = images[preferred_image_index]

# Removes the `download` query to get the viewable image URL
image_url = preferred_image["$"].replace("&download=1", "")
url = preferred_image["$"].replace("&download=1", "")
filetype = NyplDataIngester._get_filetype(
preferred_image["description"]
)
return image_url, filetype
return url, filetype

return None, None

Expand Down
30 changes: 15 additions & 15 deletions catalog/dags/providers/provider_api_scripts/phylopic.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,31 +89,31 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None:
TODO: Adapt `url` and `creator_url` to avoid redirects.
"""

uid = data.get("uuid")
if not uid:
if not (foreign_identifier := data.get("uuid")):
return None

data = data.get("_links", {})
img_url = data.get("sourceFile", {}).get("href")
foreign_url = data.get("self", {}).get("href")
if not img_url or not foreign_url:
links = data.get("_links", {})

if not (url := links.get("sourceFile", {}).get("href")):
return None

license_url = data.get("license", {}).get("href")
if not (license_info := get_license_info(license_url)):
if not (foreign_url_path := links.get("self", {}).get("href")):
return None
foreign_landing_url = self.host + foreign_url_path

foreign_url = self.host + foreign_url
license_url = links.get("license", {}).get("href")
if not (license_info := get_license_info(license_url)):
return None

title = data.get("self", {}).get("title")
creator, creator_url = self._get_creator(data.get("contributor", {}))
width, height = self._get_image_sizes(data)
title = links.get("self", {}).get("title")
creator, creator_url = self._get_creator(links.get("contributor", {}))
width, height = self._get_image_sizes(links)

return {
"license_info": license_info,
"foreign_identifier": uid,
"foreign_landing_url": foreign_url,
"image_url": img_url,
"foreign_identifier": foreign_identifier,
"foreign_landing_url": foreign_landing_url,
"url": url,
"title": title,
"creator": creator,
"creator_url": creator_url,
Expand Down
Loading