Skip to content

Commit

Permalink
Replace media_url with url in provider scripts
Browse files (browse the repository at this point in the history)
  • Loading branch information
obulat committed Apr 23, 2023
1 parent 1cb13fd commit 96fa262
Show file tree
Hide file tree
Showing 52 changed files with 156 additions and 177 deletions.
9 changes: 3 additions & 6 deletions catalog/dags/common/storage/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(
def add_item(
self,
foreign_landing_url: str,
audio_url: str,
url: str,
license_info: LicenseInfo,
thumbnail_url: str | None = None,
filesize: int | None = None,
Expand Down Expand Up @@ -78,7 +78,7 @@ def add_item(
foreign_landing_url: URL of page where the audio lives on the
source website.
audio_url: Direct link to the audio file
url: Direct link to the audio file
license_info: LicenseInfo object that has
- the URL of the license for the audio,
- string representation of the license,
Expand Down Expand Up @@ -154,7 +154,7 @@ def add_item(

audio_data = {
"foreign_landing_url": foreign_landing_url,
"audio_url": audio_url,
"url": url,
"license_info": license_info,
"thumbnail_url": thumbnail_url,
"filesize": filesize,
Expand Down Expand Up @@ -188,9 +188,6 @@ def _get_audio(self, **kwargs) -> Audio | None:
audio_metadata = self.clean_media_metadata(**kwargs)
if audio_metadata is None:
return None
# Convert the `audio_url` key used in AudioStore, TSV and
# provider API scripts into `url` key used in db
audio_metadata["url"] = audio_metadata.pop("audio_url")
# Validate that duration does not exceed Postgres int maximum
audio_metadata["duration"] = self._validate_integer(
audio_metadata.get("duration")
Expand Down
9 changes: 3 additions & 6 deletions catalog/dags/common/storage/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(
def add_item(
self,
foreign_landing_url: str,
image_url: str,
url: str,
license_info: LicenseInfo,
thumbnail_url: str | None = None,
filesize: int | None = None,
Expand All @@ -69,7 +69,7 @@ def add_item(
Required Arguments:
foreign_landing_url: URL of page where the image lives on the
source website.
image_url: Direct link to the image file
url: Direct link to the image file
license_info: LicenseInfo object that has
- the URL of the license for the image,
Expand Down Expand Up @@ -124,7 +124,7 @@ def add_item(

image_data = {
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": url,
"thumbnail_url": thumbnail_url,
"filesize": filesize,
"filetype": filetype,
Expand Down Expand Up @@ -152,9 +152,6 @@ def _get_image(self, **kwargs) -> Image | None:
image_metadata = self.clean_media_metadata(**kwargs)
if image_metadata is None:
return None
# Convert the `image_url` key used in ImageStore, TSV and
# provider API scripts into `url` key used in db
image_metadata["url"] = image_metadata.pop("image_url")
return Image(**image_metadata)


Expand Down
6 changes: 3 additions & 3 deletions catalog/dags/common/storage/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,13 @@ def clean_media_metadata(self, **media_data) -> dict | None:
for field in [
"foreign_identifier",
"foreign_landing_url",
f"{self.media_type}_url",
"url",
]:
if media_data.get(field) is None:
raise ValueError(f"Record missing required field: `{field}`")

for field in [
f"{self.media_type}_url",
"url",
"foreign_landing_url",
"thumbnail_url",
"creator_url",
Expand All @@ -150,7 +150,7 @@ def clean_media_metadata(self, **media_data) -> dict | None:
media_data["ingestion_type"] = "provider_api"

media_data["filetype"] = self._validate_filetype(
media_data["filetype"], media_data[f"{self.media_type}_url"]
media_data["filetype"], media_data["url"]
)
media_data["filesize"] = self._validate_integer(media_data.get("filesize"))

Expand Down
2 changes: 1 addition & 1 deletion catalog/dags/common/tsv_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def _process_row(tsv_row):
image_store = _image_store_dict[row_image.provider]
image_store.add_item(
foreign_landing_url=row_image.foreign_landing_url,
image_url=row_image.url,
url=row_image.url,
thumbnail_url=row_image.thumbnail_url,
license_info=get_license_info(
license_url=get_license_url(row_meta_data),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def _handle_object_data(data, license_url) -> list[dict]:
images.append(
{
"foreign_landing_url": foreign_url,
"image_url": image_url,
"url": image_url,
"license_info": license_info,
"foreign_identifier": foreign_id,
"width": width,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def get_record_data(self, data):
"foreign_landing_url": data.get("url"),
"title": data.get("title", None),
"creator": creator_name,
"image_url": image["url"],
"url": image["url"],
"width": self._get_int_value(image, "width"),
"height": self._get_int_value(image, "height"),
"filesize": self._get_int_value(image, "filesize"),
Expand Down
2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/europeana.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def get_record_data(self, data: dict) -> dict:
try:
record = {
"foreign_landing_url": self._get_foreign_landing_url(data),
"image_url": self._get_image_url(data),
"url": self._get_image_url(data),
"foreign_identifier": self._get_foreign_identifier(data),
"meta_data": self._get_meta_data_dict(data),
"title": self._get_title(data),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def get_record_data(self, data):
"license_info": get_license_info(license_url),
"foreign_identifier": foreign_identifier,
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": image_url,
"title": title,
"source": source,
"creator": creator,
Expand Down
4 changes: 2 additions & 2 deletions catalog/dags/providers/provider_api_scripts/flickr.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def get_record_data(self, data):
return None

image_size = self._get_largest_image_size(data)
if (image_url := data.get(f"url_{image_size}")) is None:
if not (url := data.get(f"url_{image_size}")):
return None

if (foreign_id := data.get("id")) is None:
Expand Down Expand Up @@ -254,7 +254,7 @@ def get_record_data(self, data):

return {
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": url,
"license_info": license_info,
"foreign_identifier": foreign_id,
"width": width,
Expand Down
6 changes: 3 additions & 3 deletions catalog/dags/providers/provider_api_scripts/freesound.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def _get_audio_files(
return None, None

main_file = {
"audio_url": preview_url,
"url": preview_url,
"filetype": self.preferred_preview.split("-")[-1],
"bit_rate": FreesoundDataIngester.preview_bitrates[self.preferred_preview],
"filesize": int(filesize),
Expand Down Expand Up @@ -239,7 +239,7 @@ def get_record_data(self, media_data: dict) -> dict | list[dict] | None:
if item_license is None:
return None

# We use the mp3-hq preview url as `audio_url` as the main url
# We use the mp3-hq preview url as `url` as the main url
# for playing on the frontend,
# and the actual uploaded file as an alt_file that is available
# for download (and requires a user to be authenticated to download)
Expand Down Expand Up @@ -270,7 +270,7 @@ def get_record_data(self, media_data: dict) -> dict | list[dict] | None:
"audio_set": audio_set,
"set_url": set_url,
"alt_files": alt_files,
# audio_url, filetype, bit_rate
# url, filetype, bit_rate
**main_audio,
}

Expand Down
10 changes: 5 additions & 5 deletions catalog/dags/providers/provider_api_scripts/jamendo.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,12 @@ def _get_audio_url(self, data):
>>> _remove_param_from_url(url, "from")
'https://prod-1.storage.jamendo.com/?trackid=1532771&format=mp31'
:return: Tuple with main audio file information:
- audio_url
- url
- duration (in milliseconds)
"""
if (audio_url := data.get("audio")) is None:
if not (url := data.get("audio")):
return None
return self._remove_param_from_url(audio_url, "from")
return self._remove_param_from_url(url, "from")

@staticmethod
def _get_creator_data(data):
Expand Down Expand Up @@ -184,7 +184,7 @@ def get_record_data(self, data):
if (foreign_landing_url := data.get("shareurl")) is None:
return None

if (audio_url := self._get_audio_url(data)) is None:
if (url := self._get_audio_url(data)) is None:
return None

license_url = data.get("license_ccurl")
Expand Down Expand Up @@ -224,7 +224,7 @@ def get_record_data(self, data):
"creator_url": creator_url,
"foreign_identifier": foreign_identifier,
"foreign_landing_url": foreign_landing_url,
"audio_url": audio_url,
"url": url,
"duration": duration,
"filetype": filetype,
"thumbnail_url": thumbnail,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def get_record_data(self, object_id):
return [
{
"foreign_landing_url": foreign_landing_url,
"image_url": img,
"url": img,
"license_info": self.DEFAULT_LICENSE_INFO,
"foreign_identifier": self._get_foreign_id(object_id, img),
"creator": artist,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def _get_images(media_data) -> list[ImageDetails]:

image: ImageDetails = {
"foreign_identifier": image_id,
"image_url": image_url,
"url": image_url,
"height": height,
"width": width,
"license_info": license_info,
Expand Down
2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/nappy.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None:

return {
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": image_url,
"thumbnail_url": thumbnail_url,
"license_info": self.license_info,
"foreign_identifier": foreign_identifier,
Expand Down
18 changes: 9 additions & 9 deletions catalog/dags/providers/provider_api_scripts/nypl.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class NyplDataIngester(ProviderDataIngester):
# NYPL returns a list of image objects, with the dimension encoded
# in the URL's query parameter.
# This list is in order from the largest image to the smallest one.
image_url_dimensions = ["g", "v", "q", "w", "r"]
url_dimensions = ["g", "v", "q", "w", "r"]

def __init__(self, *args, **kwargs):
NYPL_API = Variable.get("API_KEY_NYPL")
Expand Down Expand Up @@ -120,8 +120,8 @@ def get_record_data(self, data):
continue

image_link = capture.get("imageLinks", {}).get("imageLink", [])
image_url, filetype = self._get_image_data(image_link)
if not image_url:
url, filetype = self._get_image_data(image_link)
if not url:
continue

foreign_landing_url = capture.get("itemLink", {}).get("$")
Expand All @@ -132,7 +132,7 @@ def get_record_data(self, data):
image_data = {
"foreign_identifier": image_id,
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": url,
"license_info": get_license_info(license_url=license_url),
"title": title,
"creator": creator,
Expand Down Expand Up @@ -171,9 +171,9 @@ def _get_image_data(images) -> tuple[None, None] | tuple[str, str]:
"description": "Cropped .jpeg (1600 pixels on the long side)"
}
Selects the largest image based on the image URL's `t` query parameter
and image_url_dimensions.
and url_dimensions.
"""
# Create a dict with the NyplDataIngester.image_url_dimensions as keys,
# Create a dict with the NyplDataIngester.url_dimensions as keys,
# and image data as value.
image_types = {
parse_qs(urlparse(img["$"]).query)["t"][0]: i
Expand All @@ -185,17 +185,17 @@ def _get_image_data(images) -> tuple[None, None] | tuple[str, str]:
# Select the dict containing the URL for the largest image.
# The image size is encoded in the URL query parameter `t`.
# The list of dimensions is sorted by size of the corresponding image.
for dimension in NyplDataIngester.image_url_dimensions:
for dimension in NyplDataIngester.url_dimensions:
preferred_image_index = image_types.get(dimension)
if preferred_image_index is not None:
preferred_image = images[preferred_image_index]

# Removes the `download` query to get the viewable image URL
image_url = preferred_image["$"].replace("&download=1", "")
url = preferred_image["$"].replace("&download=1", "")
filetype = NyplDataIngester._get_filetype(
preferred_image["description"]
)
return image_url, filetype
return url, filetype

return None, None

Expand Down
2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/phylopic.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None:
"license_info": get_license_info(license_url=license_url),
"foreign_identifier": uid,
"foreign_landing_url": foreign_url,
"image_url": img_url,
"url": img_url,
"title": title,
"creator": creator,
"creator_url": creator_url,
Expand Down
2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/rawpixel.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None:
width, height = self._get_image_properties(data)
return {
"foreign_landing_url": foreign_url,
"image_url": image_url,
"url": image_url,
"license_info": license_info,
"foreign_identifier": foreign_id,
"width": width,
Expand Down
22 changes: 11 additions & 11 deletions catalog/dags/providers/provider_api_scripts/science_museum.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,12 @@ def get_record_data(self, record):
continue
processed = image_data.get("processed")
(
image_url,
url,
height,
width,
filetype,
) = self._get_image_info(processed)
if image_url is None:
if url is None:
continue

license_pair = self._get_license(image_data)
Expand All @@ -149,7 +149,7 @@ def get_record_data(self, record):
image = {
"foreign_identifier": foreign_id,
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"url": url,
"height": height,
"width": width,
"filetype": filetype,
Expand All @@ -173,12 +173,12 @@ def _get_creator_info(attributes):
return creator_info

@staticmethod
def check_url(image_url: str | None) -> str | None:
if not image_url:
def check_url(url: str | None) -> str | None:
if not url:
return None
if image_url.startswith("http"):
return image_url
return f"https://coimages.sciencemuseumgroup.org.uk/images/{image_url}"
if url.startswith("http"):
return url
return f"https://coimages.sciencemuseumgroup.org.uk/images/{url}"

@staticmethod
def _get_dimensions(image_data: dict) -> tuple[int | None, int | None]:
Expand Down Expand Up @@ -206,11 +206,11 @@ def _get_image_info(
if image_data is None:
image_data = processed.get("medium", {})

image_url = ScienceMuseumDataIngester.check_url(image_data.get("location"))
if image_url:
url = ScienceMuseumDataIngester.check_url(image_data.get("location"))
if url:
filetype = image_data.get("format")
height, width = ScienceMuseumDataIngester._get_dimensions(image_data)
return image_url, height, width, filetype
return url, height, width, filetype

@staticmethod
def _get_first_list_value(key: str, attributes: dict) -> str | None:
Expand Down
2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_api_scripts/smithsonian.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def _get_associated_images(image_list, partial_image_data: dict) -> list:
images.append(
{
**partial_image_data,
"image_url": image_url,
"url": image_url,
"foreign_identifier": foreign_identifier,
}
)
Expand Down
Loading

0 comments on commit 96fa262

Please sign in to comment.