Skip to content
This repository has been archived by the owner on Aug 4, 2023. It is now read-only.

Remove thumbnails from images #526

Merged
merged 10 commits on Jun 2, 2022
5 changes: 3 additions & 2 deletions openverse_catalog/dags/common/storage/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ def add_item(
foreign_landing_url: str,
image_url: str,
license_info: LicenseInfo,
thumbnail_url: Optional[str] = None,
filesize: Optional[int] = None,
filetype: Optional[str] = None,
foreign_identifier: Optional[str] = None,
Expand Down Expand Up @@ -117,7 +116,7 @@ def add_item(
image_data = {
"foreign_landing_url": foreign_landing_url,
"image_url": image_url,
"thumbnail_url": thumbnail_url,
"thumbnail_url": None,
"filesize": filesize,
"filetype": filetype,
"license_info": license_info,
Expand All @@ -144,6 +143,8 @@ def _get_image(self, **kwargs) -> Optional[Image]:
image_metadata = self.clean_media_metadata(**kwargs)
if image_metadata is None:
return None
if image_metadata["thumbnail_url"] is not None:
image_metadata["thumbnail_url"] = None
# Convert the `image_url` key used in ImageStore, TSV and
# provider API scripts into `url` key used in db
image_metadata["url"] = image_metadata.pop("image_url")
Expand Down
1 change: 0 additions & 1 deletion openverse_catalog/dags/common/tsv_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def _process_row(tsv_row):
image_store.add_item(
foreign_landing_url=row_image.foreign_landing_url,
image_url=row_image.url,
thumbnail_url=row_image.thumbnail_url,
license_info=get_license_info(
license_url=get_license_url(row_meta_data),
license_=row_image.license_,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def _handle_object_data(data, license_url):

for image in image_info:
foreign_id = image.get("id", "")
image_url, thumbnail_url = _get_images(image)
image_url = _get_image_url(image)
if image_url is None:
continue
height, width = _get_image_sizes(image)
Expand All @@ -114,7 +114,6 @@ def _handle_object_data(data, license_url):
width=width,
height=height,
title=title,
thumbnail_url=thumbnail_url,
meta_data=metadata,
creator=creators,
)
Expand Down Expand Up @@ -171,16 +170,11 @@ def _get_creators(data):
return creator


def _get_images(image):
image_url, thumbnail_url = None, None
def _get_image_url(image):
image_url = image.get("largest_derivative_url")
if image_url:
if "http" not in image_url:
image_url = "https://" + image_url
thumbnail_url = image.get("thumbnail_url", "")
if "http" not in thumbnail_url and thumbnail_url:
thumbnail_url = "https://" + thumbnail_url
return image_url, thumbnail_url
if image_url and not image_url.startswith("http"):
image_url = "https://" + image_url
return image_url


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,6 @@ def _process_image_data(image_data, sub_providers=SUB_PROVIDERS, provider=PROVID
image_url = image_data.get("edmIsShownBy")[0]
foreign_landing_url = _get_foreign_landing_url(image_data)
foreign_id = image_data.get("id")
thumbnail_url = image_data.get("edmPreview")[0]
title = image_data.get("title")[0]
meta_data = _create_meta_data_dict(image_data)

Expand All @@ -185,7 +184,6 @@ def _process_image_data(image_data, sub_providers=SUB_PROVIDERS, provider=PROVID
foreign_landing_url=foreign_landing_url,
image_url=image_url,
license_info=license_info,
thumbnail_url=thumbnail_url,
foreign_identifier=foreign_id,
title=title,
meta_data=meta_data,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,6 @@ def _process_image_data(image_data, sub_providers=SUB_PROVIDERS, provider=PROVID
return image_store.add_item(
foreign_landing_url=foreign_landing_url,
image_url=image_url,
thumbnail_url=image_data.get("url_s"),
license_info=get_license_info(
license_=license_, license_version=license_version
),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,7 @@ def _get_batch_json(
if response_json is None:
return None
else:
results = response_json.get("results")
return results
return response_json.get("results")


def _process_item_batch(items_batch):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,18 +95,16 @@ def _get_data_for_image(object_id):
return

main_image = object_json.get("primaryImage")
main_thumbnail = object_json.get("primaryImageSmall")
other_images = object_json.get("additionalImages", [])
image_list = [(main_image, main_thumbnail)] + [(i, None) for i in other_images]
image_list = [main_image] + other_images

meta_data = _create_meta_data(object_json)

for img, thumb in image_list:
for img in image_list:
foreign_id = _build_foreign_id(object_id, img)
image_store.add_item(
foreign_landing_url=object_json.get("objectURL"),
image_url=img,
thumbnail_url=thumb,
license_info=DEFAULT_LICENSE_INFO,
foreign_identifier=foreign_id,
creator=object_json.get("artistDisplayName"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ def _handle_batch_objects(objects, landing_page=LANDING_PAGE):
height=img.get("height"),
width=img.get("width"),
license_info=license_info,
thumbnail_url=img.get("thumbnail"),
title=title,
creator=img.get("creators"),
meta_data=meta_data,
Expand All @@ -129,7 +128,6 @@ def _get_media_info(media_data):
image_id = media.get("id")
image_url, height, width = _get_image_data(media)
license_url = _get_license_url(media)
thumbnail_url = media.get("thumbnail", {}).get("uri")
if image_url is None or image_id is None or license_url is None:
continue
creators = _get_creator(media)
Expand All @@ -140,7 +138,6 @@ def _get_media_info(media_data):
"height": height,
"width": width,
"license_url": license_url,
"thumbnail": thumbnail_url,
"creators": creators,
}
)
Expand Down
14 changes: 3 additions & 11 deletions openverse_catalog/dags/providers/provider_api_scripts/nypl.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,6 @@

IMAGE_URL_DIMENSIONS = ["g", "v", "q", "w", "r"]

THUMBNAIL_DIMENSIONS = ["w", "r", "q", "f", "v", "g"]


def main():
page = 1
Expand Down Expand Up @@ -121,9 +119,7 @@ def _get_capture_details(captures=None, metadata=None, creator=None, title=None)
image_id = img.get("imageID", {}).get("$")
if image_id is None:
continue
image_url, thumbnail_url = _get_images(
img.get("imageLinks", {}).get("imageLink", [])
)
image_url = _get_image_url(img.get("imageLinks", {}).get("imageLink", []))
foreign_landing_url = img.get("itemLink", {}).get("$")
license_url = img.get("rightsStatementURI", {}).get("$")
if image_url is None or foreign_landing_url is None or license_url is None:
Expand All @@ -134,7 +130,6 @@ def _get_capture_details(captures=None, metadata=None, creator=None, title=None)
foreign_landing_url=foreign_landing_url,
image_url=image_url,
license_info=get_license_info(license_url=license_url),
thumbnail_url=thumbnail_url,
title=title,
creator=creator,
meta_data=metadata,
Expand Down Expand Up @@ -165,19 +160,16 @@ def _get_creators(creatorinfo):
return creator


def _get_images(images, image_url_dimensions=None, thumbnail_dimensions=None):
if thumbnail_dimensions is None:
thumbnail_dimensions = THUMBNAIL_DIMENSIONS
def _get_image_url(images, image_url_dimensions=None):
if image_url_dimensions is None:
image_url_dimensions = IMAGE_URL_DIMENSIONS
image_type = {
parse_qs(urlparse(img.get("$")).query)["t"][0]: img.get("$") for img in images
}

image_url = _get_preferred_image(image_type, image_url_dimensions)
thumbnail_url = _get_preferred_image(image_type, thumbnail_dimensions)

return image_url, thumbnail_url
return image_url


def _get_preferred_image(image_type, dimension_list):
Expand Down
59 changes: 17 additions & 42 deletions openverse_catalog/dags/providers/provider_api_scripts/phylopic.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,24 +88,7 @@ def _add_data_to_buffer(**kwargs):
if id_ is not None:
details = _get_meta_data(id_)
if details is not None:
kwargs = _create_args(details, id_)
image_store.add_item(**kwargs)


def _create_args(details, id_):
    """Map the positional ``details`` record onto ``add_item`` keyword arguments.

    ``details`` is read by position (indices 1 through 9); the value at
    index 6 is passed to ``get_license_info`` as the license URL, and
    ``id_`` is used as the ``foreign_identifier``.
    """
    return dict(
        foreign_landing_url=details[1],
        image_url=details[2],
        thumbnail_url=details[3],
        license_info=get_license_info(license_url=details[6]),
        width=details[4],
        height=details[5],
        creator=details[7],
        title=details[8],
        meta_data=details[9],
        foreign_identifier=id_,
    )
image_store.add_item(**details)


def _get_total_images():
Expand Down Expand Up @@ -185,23 +168,23 @@ def _get_meta_data(_uuid):
result
)

img_url, width, height, thumbnail = _get_image_info(result, _uuid)
foreign_id = img_url
img_url, width, height = _get_image_info(result, _uuid)

if img_url is None:
return None

return [
foreign_id,
foreign_url,
img_url,
thumbnail,
str(width),
str(height),
license_url,
creator,
title,
meta_data,
]
details = {
obulat marked this conversation as resolved.
Show resolved Hide resolved
"foreign_identifier": _uuid,
"foreign_landing_url": foreign_url,
"image_url": img_url,
"license_info": get_license_info(license_url=license_url),
"width": str(width),
"height": str(height),
"creator": creator,
"title": title,
"meta_data": meta_data,
}
return details


def _get_creator_details(result):
Expand Down Expand Up @@ -240,34 +223,26 @@ def _get_taxa_details(result):
def _get_image_info(result, _uuid):
base_url = "http://phylopic.org"
img_url = ""
thumbnail = ""
width = ""
height = ""

image_info = result.get("pngFiles")
img = []
thb = []
if image_info:
img = list(filter(lambda x: (int(str(x.get("width", "0"))) >= 257), image_info))
img = sorted(img, key=lambda x: x["width"], reverse=True)
thb = list(filter(lambda x: str(x.get("width", "")) == "256", image_info))

if len(img) > 0:
img_url = img[0].get("url")
img_url = f"{base_url}{img_url}"
width = img[0].get("width")
height = img[0].get("height")

if len(thb) > 0:
thumbnail_info = thb[0].get("url")
if thumbnail_info is not None:
thumbnail = f"{base_url}{thumbnail_info}"

if img_url == "":
logging.warning(f"Image not detected in url: {base_url}/image/{_uuid}")
return None, None, None, None
return None, None, None
else:
return img_url, width, height, thumbnail
return img_url, width, height


def _compute_date_range(date_start: str, days: int = DEFAULT_PROCESS_DAYS) -> str:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,7 @@ def _get_image_properties(image, foreign_url):
query_params = urlparse(img_url)
width = parse_qs(query_params.query).get("w", [])[0]
height = parse_qs(query_params.query).get("h", [])[0]
thumbnail = image.get("image_400", "")
return [img_url, width, height, thumbnail]
return [img_url, width, height]
else:
logger.warning(f"Image not detected in URL: {foreign_url}")
return [None, None, None, None]
Expand Down Expand Up @@ -120,7 +119,7 @@ def _process_image_data(image):
foreign_id, foreign_url = _get_foreign_id_url(image)
if not foreign_url:
return None
img_url, width, height, thumbnail = _get_image_properties(image, foreign_url)
img_url, width, height = _get_image_properties(image, foreign_url)
if not img_url:
return None
title, owner = _get_title_owner(image)
Expand All @@ -143,7 +142,6 @@ def _process_image_data(image):
meta_data=meta_data,
raw_tags=tags,
creator=owner,
thumbnail_url=thumbnail,
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,15 +145,13 @@ def _handle_object_data(batch_data):
license_, version = license_version.lower().split(" ")
license_ = license_.replace("cc-", "")
license_info = get_license_info(license_=license_, license_version=version)
thumbnail_url = _get_thumbnail_url(processed)
image_count = image_store.add_item(
foreign_identifier=foreign_id,
foreign_landing_url=foreign_landing_url,
image_url=image_url,
height=height,
width=width,
license_info=license_info,
thumbnail_url=thumbnail_url,
creator=creator,
title=title,
meta_data=metadata,
Expand Down Expand Up @@ -188,19 +186,6 @@ def _get_image_info(processed):
return image, height, width


def _get_thumbnail_url(processed):
    """Return the checked URL of the first available thumbnail in ``processed``.

    The ``large_thumbnail``, ``medium_thumbnail`` and ``small_thumbnail``
    entries are tried in that order; the first truthy one supplies its
    ``location`` value. When none of them is present, ``None`` is handed
    to ``check_url``.
    """
    location = None
    for size_key in ("large_thumbnail", "medium_thumbnail", "small_thumbnail"):
        entry = processed.get(size_key)
        if entry:
            location = entry.get("location")
            break
    return check_url(location)


def check_url(image_url):
base_url = "https://coimages.sciencemuseumgroup.org.uk/images/"
if image_url:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,6 @@ def _process_image_list(
total_images = image_store.add_item(
foreign_landing_url=foreign_landing_url,
image_url=image_data.get("content"),
thumbnail_url=image_data.get("thumbnail"),
license_info=LicenseInfo(
"cc0",
"1.0",
Expand Down
Loading