From 25e18fa74e102b5473c7206daeb0900abda4184c Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Fri, 25 Jun 2021 15:23:35 +0300 Subject: [PATCH] Extract MediaStorage entity as parent to ImageStore (#83) --- .../dags/common/storage/image.py | 400 ++------ .../dags/common/storage/media.py | 312 ++++++ .../dags/common/storage/test_image.py | 300 ++---- .../dags/common/storage/test_media.py | 912 ++++++++++++++++++ .../provider_api_scripts/brooklyn_museum.py | 4 +- .../cleveland_museum_of_art.py | 22 +- .../dags/provider_api_scripts/nypl.py | 4 +- .../provider_api_scripts/wikimedia_commons.py | 4 +- .../dags/util/loader/ingestion_column.py | 4 +- .../dags/util/pg_cleaner.py | 6 +- .../dags/util/test_pg_cleaner.py | 4 +- 11 files changed, 1377 insertions(+), 595 deletions(-) create mode 100644 src/cc_catalog_airflow/dags/common/storage/media.py create mode 100644 src/cc_catalog_airflow/dags/common/storage/test_media.py diff --git a/src/cc_catalog_airflow/dags/common/storage/image.py b/src/cc_catalog_airflow/dags/common/storage/image.py index 4b99d87da..066fb5e1d 100644 --- a/src/cc_catalog_airflow/dags/common/storage/image.py +++ b/src/cc_catalog_airflow/dags/common/storage/image.py @@ -1,15 +1,13 @@ from collections import namedtuple -from datetime import datetime import logging -import os +from typing import Optional, Dict, Union -from common.licenses import licenses -from common.storage import util from common.storage import columns +from common.storage.media import MediaStore logger = logging.getLogger(__name__) -_IMAGE_TSV_COLUMNS = [ +IMAGE_TSV_COLUMNS = [ # The order of this list maps to the order of the columns in the TSV. columns.StringColumn( name='foreign_identifier', required=False, size=3000, truncate=False @@ -69,37 +67,10 @@ ), ] -Image = namedtuple( - 'Image', - [c.NAME for c in _IMAGE_TSV_COLUMNS] -) +Image = namedtuple("Image", [c.NAME for c in IMAGE_TSV_COLUMNS]) -# Filter out tags that exactly match these terms. All terms should be -# lowercase. -TAG_BLACKLIST = { - 'no person', - 'squareformat' -} -# Filter out tags that contain the following terms. All entrees should be -# lowercase. -TAG_CONTAINS_BLACKLIST = { - 'flickriosapp', - 'uploaded', - ':', - '=', - 'cc0', - 'by', - 'by-nc', - 'by-nd', - 'by-sa', - 'by-nc-nd', - 'by-nc-sa', - 'pdm' -} - - -class ImageStore: +class ImageStore(MediaStore): """ A class that stores image information from a given provider. @@ -113,55 +84,49 @@ class ImageStore: """ def __init__( - self, - provider=None, - output_file=None, - output_dir=None, - buffer_length=100 + self, + provider=None, + output_file=None, + output_dir=None, + buffer_length=100, + media_type="image", + tsv_columns=None, ): - logger.info(f'Initialized with provider {provider}') - self._image_buffer = [] - self._total_images = 0 - self._PROVIDER = provider - self._BUFFER_LENGTH = buffer_length - self._NOW = datetime.now() - self._OUTPUT_PATH = self._initialize_output_path( - output_dir, - output_file, - provider, + super().__init__( + provider, output_file, output_dir, buffer_length, media_type ) + self.columns = IMAGE_TSV_COLUMNS \ + if tsv_columns is None else tsv_columns def add_item( - self, - foreign_landing_url=None, - image_url=None, - thumbnail_url=None, - license_url=None, - license_=None, - license_version=None, - foreign_identifier=None, - width=None, - height=None, - creator=None, - creator_url=None, - title=None, - meta_data=None, - raw_tags=None, - watermarked='f', - source=None, - ingestion_type='commoncrawl' + self, + foreign_landing_url: str, + image_url: str, + thumbnail_url: Optional[str] = None, + license_url: Optional[str] = None, + license_: Optional[str] = None, + license_version: Optional[str] = None, + foreign_identifier: Optional[str] = None, + width: Optional[int] = None, + height: Optional[int] = None, + creator: Optional[str] = None, + creator_url: Optional[str] = None, + title: Optional[str] = None, + meta_data: Optional[Union[Dict, str]] = None, + raw_tags=None, + watermarked: Optional[str] = "f", + source: Optional[str] = None, + ingestion_type: Optional[str] = 'commoncrawl', ): """ Add information for a single image to the ImageStore. Required Arguments: - foreign_landing_url: URL of page where the image lives on the source website. image_url: Direct link to the image file Semi-Required Arguments - license_url: URL of the license for the image on the Creative Commons website. license_: String representation of a Creative Commons @@ -169,7 +134,7 @@ def add_item( `common.license.constants.get_license_path_map()` license_version: Version of the given license. In the case of the `publicdomain` license, which has no - version, one shoud pass + version, one should pass `common.license.constants.NO_VERSION` here. Note on license arguments: These are 'semi-required' in that @@ -178,7 +143,6 @@ def add_item( image data will be discarded. Optional Arguments: - thumbnail_url: Direct link to a thumbnail-sized version of the image foreign_identifier: Unique identifier for the image on the @@ -208,208 +172,37 @@ def add_item( through an api - 'provider_api' or using the commoncrawl database - 'commoncrawl' """ - image = self._get_image( - foreign_landing_url=foreign_landing_url, - image_url=image_url, - thumbnail_url=thumbnail_url, - license_url=license_url, - license_=license_, - license_version=license_version, - foreign_identifier=foreign_identifier, - width=width, - height=height, - creator=creator, - creator_url=creator_url, - title=title, - meta_data=meta_data, - raw_tags=raw_tags, - watermarked=watermarked, - source=source, - ingestion_type=ingestion_type, - ) - tsv_row = self._create_tsv_row(image) - if tsv_row: - self._image_buffer.append(tsv_row) - self._total_images += 1 - if len(self._image_buffer) >= self._BUFFER_LENGTH: - self._flush_buffer() - - return self._total_images - - def commit(self): - """Writes all remaining images in the buffer to disk.""" - self._flush_buffer() - - return self._total_images - - def _initialize_output_path(self, output_dir, output_file, provider): - if output_dir is None: - logger.info( - 'No given output directory. ' - 'Using OUTPUT_DIR from environment.' - ) - output_dir = os.getenv('OUTPUT_DIR') - if output_dir is None: - logger.warning( - 'OUTPUT_DIR is not set in the enivronment. ' - 'Output will go to /tmp.' - ) - output_dir = '/tmp' - - if output_file is not None: - output_file = str(output_file) - else: - output_file = ( - f'{provider}_{datetime.strftime(self._NOW, "%Y%m%d%H%M%S")}' - f'.tsv' - ) - - output_path = os.path.join(output_dir, output_file) - logger.info(f'Output path: {output_path}') - return output_path - - def _get_total_images(self): - return self._total_images - - """Get total images for directly using in scripts.""" - total_images = property(_get_total_images) - - def _get_image( - self, - foreign_identifier, - foreign_landing_url, - image_url, - thumbnail_url, - width, - height, - license_url, - license_, - license_version, - creator, - creator_url, - title, - meta_data, - raw_tags, - watermarked, - source, - ingestion_type - ): - valid_license_info = licenses.get_license_info( - license_url=license_url, - license_=license_, - license_version=license_version - ) - source = util.get_source(source, self._PROVIDER) - meta_data = self._enrich_meta_data( - meta_data, - license_url=valid_license_info.url, - raw_license_url=license_url - ) - tags = self._enrich_tags(raw_tags) - - return Image( - foreign_identifier=foreign_identifier, - foreign_landing_url=foreign_landing_url, - image_url=image_url, - thumbnail_url=thumbnail_url, - license_=valid_license_info.license, - license_version=valid_license_info.version, - width=width, - height=height, - filesize=None, - creator=creator, - creator_url=creator_url, - title=title, - meta_data=meta_data, - tags=tags, - watermarked=watermarked, - provider=self._PROVIDER, - source=source, - ingestion_type=ingestion_type - ) - - def _create_tsv_row( - self, - image, - columns=_IMAGE_TSV_COLUMNS - ): - row_length = len(columns) - prepared_strings = [ - columns[i].prepare_string(image[i]) for i in range(row_length) - ] - logger.debug(f'Prepared strings list:\n{prepared_strings}') - for i in range(row_length): - if columns[i].REQUIRED and prepared_strings[i] is None: - logger.warning(f'Row missing required {columns[i].NAME}') - return None - else: - return '\t'.join( - [s if s is not None else '\\N' for s in prepared_strings] - ) + '\n' - - def _flush_buffer(self): - buffer_length = len(self._image_buffer) - if buffer_length > 0: - logger.info( - f'Writing {buffer_length} lines from buffer to disk.' - ) - with open(self._OUTPUT_PATH, 'a') as f: - f.writelines(self._image_buffer) - self._image_buffer = [] - logger.debug( - f'Total Images Processed so far: {self._total_images}' - ) - else: - logger.debug('Empty buffer! Nothing to write.') - return buffer_length - - def _tag_blacklisted(self, tag): - """ - Tag is banned or contains a banned substring. - :param tag: the tag to be verified against the blacklist - :return: true if tag is blacklisted, else returns false - """ - if type(tag) == dict: # check if the tag is already enriched - tag = tag.get('name') - if tag in TAG_BLACKLIST: - return True - for blacklisted_substring in TAG_CONTAINS_BLACKLIST: - if blacklisted_substring in tag: - return True - return False - - def _enrich_meta_data(self, meta_data, license_url, raw_license_url): - if type(meta_data) != dict: - logger.debug( - f'`meta_data` is not a dictionary: {meta_data}' - ) - enriched_meta_data = { - 'license_url': license_url, 'raw_license_url': raw_license_url - } - else: - enriched_meta_data = meta_data - enriched_meta_data.update( - license_url=license_url, raw_license_url=raw_license_url - ) - return enriched_meta_data - - def _enrich_tags(self, raw_tags): - if type(raw_tags) != list: - logger.debug('`tags` is not a list.') + image_data = { + 'foreign_landing_url': foreign_landing_url, + 'image_url': image_url, + 'thumbnail_url': thumbnail_url, + 'license_': license_, + 'license_url': license_url, + 'license_version': license_version, + 'foreign_identifier': foreign_identifier, + 'width': width, + 'height': height, + 'creator': creator, + 'creator_url': creator_url, + 'title': title, + 'meta_data': meta_data, + 'raw_tags': raw_tags, + 'watermarked': watermarked, + 'source': source, + 'ingestion_type': ingestion_type + } + image = self._get_image(**image_data) + if image is not None: + self.save_item(image) + return self.total_items + + def _get_image(self, **kwargs) -> Optional[Image]: + """Validates image information and returns Image namedtuple""" + image_metadata = self.clean_media_metadata(**kwargs) + if image_metadata is None: return None - else: - return [ - self._format_raw_tag(tag) for tag in raw_tags - if not self._tag_blacklisted(tag) - ] - def _format_raw_tag(self, tag): - if type(tag) == dict and tag.get('name') and tag.get('provider'): - logger.debug(f'Tag already enriched: {tag}') - return tag - else: - logger.debug(f'Enriching tag: {tag}') - return {'name': tag, 'provider': self._PROVIDER} + return Image(**image_metadata) class MockImageStore(ImageStore): @@ -426,64 +219,13 @@ class MockImageStore(ImageStore): """ def __init__( - self, - provider=None, - output_file=None, - output_dir=None, - buffer_length=100, - license_info=None + self, + provider=None, + output_file=None, + output_dir=None, + buffer_length=100, + license_info=None, ): - logger.info(f'Initialized with provider {provider}') + logger.info(f"Initialized with provider {provider}") super().__init__(provider=provider) self.license_info = license_info - - def _get_image( - self, - foreign_identifier, - foreign_landing_url, - image_url, - thumbnail_url, - width, - height, - license_url, - license_, - license_version, - creator, - creator_url, - title, - meta_data, - raw_tags, - watermarked, - source, - ingestion_type, - ): - valid_license_info = self.license_info - - source = util.get_source(source, self._PROVIDER) - meta_data = self._enrich_meta_data( - meta_data, - license_url=valid_license_info.url, - raw_license_url=license_url - ) - tags = self._enrich_tags(raw_tags) - - return Image( - foreign_identifier=foreign_identifier, - foreign_landing_url=foreign_landing_url, - image_url=image_url, - thumbnail_url=thumbnail_url, - license_=valid_license_info.license, - license_version=valid_license_info.version, - width=width, - height=height, - filesize=None, - creator=creator, - creator_url=creator_url, - title=title, - meta_data=meta_data, - tags=tags, - watermarked=watermarked, - provider=self._PROVIDER, - source=source, - ingestion_type=ingestion_type, - ) diff --git a/src/cc_catalog_airflow/dags/common/storage/media.py b/src/cc_catalog_airflow/dags/common/storage/media.py new file mode 100644 index 000000000..020ff5d4c --- /dev/null +++ b/src/cc_catalog_airflow/dags/common/storage/media.py @@ -0,0 +1,312 @@ +import abc +from datetime import datetime +import logging +import os +from typing import Optional, Union + +from common.storage import util +from common.licenses import licenses + +logger = logging.getLogger(__name__) + +# Filter out tags that exactly match these terms. All terms should be +# lowercase. +TAG_BLACKLIST = {"no person", "squareformat"} + +# Filter out tags that contain the following terms. All entrees should be +# lowercase. +TAG_CONTAINS_BLACKLIST = { + "flickriosapp", + "uploaded", + ":", + "=", + "cc0", + "by", + "by-nc", + "by-nd", + "by-sa", + "by-nc-nd", + "by-nc-sa", + "pdm", +} + + +class MediaStore(metaclass=abc.ABCMeta): + """ + An abstract base class that stores media information from a given provider. + + Optional init arguments: + provider: String marking the provider in the `media` + (`image`, `audio` etc) table of the DB. + output_file: String giving a temporary .tsv filename (*not* the + full path) where the media info should be stored. + output_dir: String giving a path where `output_file` should be placed. + buffer_length: Integer giving the maximum number of media information rows + to store in memory before writing them to disk. + """ + + def __init__( + self, + provider: Optional[str] = None, + output_file: Optional[str] = None, + output_dir: Optional[str] = None, + buffer_length: int = 100, + media_type: Optional[str] = "generic", + ): + logger.info(f"Initialized {media_type} MediaStore" + f" with provider {provider}") + self.media_type = media_type + self._media_buffer = [] + self._total_items = 0 + self._PROVIDER = provider + self._BUFFER_LENGTH = buffer_length + self._NOW = datetime.now() + self._OUTPUT_PATH = self._initialize_output_path( + output_dir, + output_file, + provider, + ) + self.columns = None + + def save_item(self, media) -> None: + """ + Appends item data to the buffer as a tsv row, + only if data is valid. + + Args: + media: a namedtuple with validated media metadata + """ + tsv_row = self._create_tsv_row(media) + if tsv_row: + self._media_buffer.append(tsv_row) + self._total_items += 1 + if len(self._media_buffer) >= self._BUFFER_LENGTH: + self._flush_buffer() + + @abc.abstractmethod + def add_item(self, **kwargs): + """ + Abstract method to clean the item data and add it to the store + """ + pass + + @staticmethod + def validate_license_info(media_data) -> Optional[dict]: + """ + Replaces license properties in media data with validated + values. Generates license information based on `license_url`, or + pair of `license_` and `license_version` properties of + media_data dictionary. + Adds `raw_license_url` if the `license_url` has been rewritten + either because it was invalid, or to add 'https:', + or trailing '/' at the end. + Returns `None` if license data is invalid. + """ + license_url = media_data.pop('license_url', None) + license_ = media_data.pop('license_', None) + license_version = media_data.pop('license_version', None) + + valid_license_info = licenses.get_license_info( + license_url=license_url, + license_=license_, + license_version=license_version + ) + + if valid_license_info.license is None: + logger.debug( + f"Invalid image license." + f" URL: <{license_url}>," + f" license: {license_}," + f" version: {license_version}") + return None + media_data.update({ + 'license_url': valid_license_info.url, + 'license_': valid_license_info.license, + 'license_version': valid_license_info.version, + }) + if valid_license_info.url != license_url: + media_data['raw_license_url'] = license_url + + return media_data + + def clean_media_metadata(self, **media_data) -> Optional[dict]: + """ + Cleans the base media metadata common for all media types. + Enriches `meta_data` and `tags`. + Returns a dictionary: media_type-specific fields are untouched, + and for common metadata we: + - remove `license_url` and `raw_license_url`, + - validate `license_` and `license_version`, + - enrich `metadata` and `tags`, + - remove `raw_tags` are removed, + - validate `source`, + - add `provider`, + - add `filesize` (with value of None) + + Returns None if license is invalid + """ + media_data = self.validate_license_info(media_data) + if media_data is None: + return None + + media_data['source'] = util.get_source( + media_data.get('source'), + self._PROVIDER + ) + media_data['tags'] = self._enrich_tags( + media_data.pop('raw_tags', None) + ) + media_data['meta_data'] = self._enrich_meta_data( + media_data.pop('meta_data', None), + media_data.pop('license_url', None), + media_data.pop('raw_license_url', None), + ) + + media_data['provider'] = self._PROVIDER + media_data['filesize'] = None + return media_data + + def commit(self): + """Writes all remaining media items in the buffer to disk.""" + self._flush_buffer() + return self.total_items + + def _initialize_output_path( + self, + output_dir: Optional[str], + output_file: Optional[str], + provider: str, + ) -> str: + """Creates the path for the tsv file. + If output_dir and output_file ar not given, + the following filename is used: + `/tmp/{provider_name}_{media_type}_{timestamp}.tsv` + + Returns: + Path of the tsv file to write media data pulled from providers + """ + if output_dir is None: + logger.info("No given output directory. " + "Using OUTPUT_DIR from environment.") + output_dir = os.getenv("OUTPUT_DIR") + if output_dir is None: + logger.warning( + "OUTPUT_DIR is not set in the environment. " + "Output will go to /tmp." + ) + output_dir = "/tmp" + + if output_file is not None: + output_file = str(output_file) + else: + datetime_string = datetime.strftime( + self._NOW, '%Y%m%d%H%M%S') + output_file = ( + f"{provider}_{self.media_type}" + f"_{datetime_string}.tsv" + ) + + output_path = os.path.join(output_dir, output_file) + logger.info(f"Output path: {output_path}") + return output_path + + @property + def total_items(self): + """Get total items for directly using in scripts.""" + return self._total_items + + def _create_tsv_row(self, item): + row_length = len(self.columns) + prepared_strings = [ + self.columns[i].prepare_string(item[i]) for i in range(row_length) + ] + logger.debug(f"Prepared strings list:\n{prepared_strings}") + for i in range(row_length): + if self.columns[i].REQUIRED and prepared_strings[i] is None: + logger.warning(f"Row missing required {self.columns[i].NAME}") + return None + else: + return ( + "\t".join( + [s if s is not None else "\\N" + for s in prepared_strings]) + + "\n" + ) + + def _flush_buffer(self) -> int: + buffer_length = len(self._media_buffer) + if buffer_length > 0: + logger.info(f"Writing {buffer_length} lines from buffer to disk.") + with open(self._OUTPUT_PATH, "a") as f: + f.writelines(self._media_buffer) + self._media_buffer = [] + logger.debug( + f"Total Media Items Processed so far: {self._total_items}" + ) + else: + logger.debug("Empty buffer! Nothing to write.") + return buffer_length + + @staticmethod + def _tag_blacklisted(tag: Union[str, dict]) -> bool: + """ + Tag is banned or contains a banned substring. + :param tag: the tag to be verified against the blacklist + :return: true if tag is blacklisted, else returns false + """ + if type(tag) == dict: # check if the tag is already enriched + tag = tag.get("name") + if tag in TAG_BLACKLIST: + return True + for blacklisted_substring in TAG_CONTAINS_BLACKLIST: + if blacklisted_substring in tag: + return True + return False + + @staticmethod + def _enrich_meta_data( + meta_data, license_url, raw_license_url) -> dict: + """ + Makes sure that meta_data is a dictionary, and contains + license_url and raw_license_url + """ + if type(meta_data) != dict: + logger.debug(f"`meta_data` is not a dictionary: {meta_data}") + enriched_meta_data = { + "license_url": license_url, + "raw_license_url": raw_license_url, + } + else: + enriched_meta_data = meta_data + enriched_meta_data.update( + license_url=license_url, raw_license_url=raw_license_url + ) + return enriched_meta_data + + def _enrich_tags(self, raw_tags) -> Optional[list]: + """Takes a list of tags and adds provider information to them + + Args: + raw_tags: List of strings or dictionaries + + Returns: + A list of 'enriched' tags: + {"name": "tag_name", "provider": self._PROVIDER} + """ + if type(raw_tags) != list: + logger.debug("`tags` is not a list.") + return None + else: + return [ + self._format_raw_tag(tag) + for tag in raw_tags + if not self._tag_blacklisted(tag) + ] + + def _format_raw_tag(self, tag): + if type(tag) == dict and tag.get("name") and tag.get("provider"): + logger.debug(f"Tag already enriched: {tag}") + return tag + else: + logger.debug(f"Enriching tag: {tag}") + return {"name": tag, "provider": self._PROVIDER} diff --git a/src/cc_catalog_airflow/dags/common/storage/test_image.py b/src/cc_catalog_airflow/dags/common/storage/test_image.py index d52850998..7539e0536 100644 --- a/src/cc_catalog_airflow/dags/common/storage/test_image.py +++ b/src/cc_catalog_airflow/dags/common/storage/test_image.py @@ -1,10 +1,15 @@ import logging +from unittest.mock import patch + import requests import pytest import tldextract +from common.licenses import licenses from common.storage import image +from common.storage import util + logging.basicConfig( format='%(asctime)s - %(name)s - %(levelname)s: %(message)s', @@ -13,7 +18,7 @@ logger = logging.getLogger(__name__) # This avoids needing the internet for testing. -image.licenses.urls.tldextract.extract = tldextract.TLDExtract( +licenses.urls.tldextract.extract = tldextract.TLDExtract( suffix_list_urls=None ) image.columns.urls.tldextract.extract = tldextract.TLDExtract( @@ -31,7 +36,7 @@ def mock_rewriter(monkeypatch): def mock_rewrite_redirected_url(url_string): return url_string monkeypatch.setattr( - image.licenses.urls, + licenses.urls, 'rewrite_redirected_url', mock_rewrite_redirected_url, ) @@ -41,7 +46,7 @@ def mock_rewrite_redirected_url(url_string): def get_good(monkeypatch): def mock_get(url, timeout=60): return requests.Response() - monkeypatch.setattr(image.licenses.urls.requests, 'get', mock_get) + monkeypatch.setattr(licenses.urls.requests, 'get', mock_get) def test_ImageStore_uses_OUTPUT_DIR_variable( @@ -80,7 +85,7 @@ def test_ImageStore_add_item_adds_realistic_image_to_buffer( image_url='https://images.org/image01.jpg', license_url=license_url, ) - assert len(image_store._image_buffer) == 1 + assert len(image_store._media_buffer) == 1 def test_ImageStore_add_item_adds_multiple_images_to_buffer( @@ -107,7 +112,7 @@ def test_ImageStore_add_item_adds_multiple_images_to_buffer( image_url='https://images.org/image04.jpg', license_url='https://creativecommons.org/publicdomain/zero/1.0/' ) - assert len(image_store._image_buffer) == 4 + assert len(image_store._media_buffer) == 4 def test_ImageStore_add_item_flushes_buffer( @@ -145,7 +150,7 @@ def test_ImageStore_add_item_flushes_buffer( image_url='https://images.org/image04.jpg', license_url='https://creativecommons.org/publicdomain/zero/1.0/' ) - assert len(image_store._image_buffer) == 1 + assert len(image_store._media_buffer) == 1 with open(tmp_path_full) as f: lines = f.read().split('\n') assert len(lines) == 4 # recall the last '\n' will create an empty line. @@ -173,7 +178,7 @@ def test_ImageStore_produces_correct_total_images(mock_rewriter, setup_env): image_url='https://images.org/image03.jpg', license_url='https://creativecommons.org/publicdomain/zero/1.0/' ) - assert image_store.total_images == 3 + assert image_store.total_items == 3 def test_ImageStore_get_image_places_given_args( @@ -183,12 +188,12 @@ def test_ImageStore_get_image_places_given_args( image_store = image.ImageStore(provider='testing_provider') args_dict = { 'foreign_landing_url': 'https://landing_page.com', - 'image_url': 'http://imageurl.com', - 'license_': 'testlicense', + 'image_url': 'https://imageurl.com', + 'license_': 'by', 'license_version': '1.0', 'license_url': None, 'foreign_identifier': 'foreign_id', - 'thumbnail_url': 'http://thumbnail.com', + 'thumbnail_url': 'https://thumbnail.com', 'width': 200, 'height': 500, 'creator': 'tyler', @@ -201,20 +206,10 @@ def test_ImageStore_get_image_places_given_args( 'ingestion_type': 'provider_api', } - def mock_license_chooser(license_url, license_, license_version): - return image.licenses.LicenseInfo( - license_, license_version, license_url - ) - monkeypatch.setattr( - image.licenses, - 'get_license_info', - mock_license_chooser - ) - def mock_get_source(source, provider): return source monkeypatch.setattr( - image.util, + util, 'get_source', mock_get_source ) @@ -235,223 +230,44 @@ def mock_enrich_tags(tags): assert actual_image == image.Image(**args_dict) -def test_ImageStore_get_image_calls_license_chooser( - monkeypatch, - setup_env, -): - image_store = image.ImageStore() - - def mock_license_chooser(license_url, license_, license_version): - return image.licenses.LicenseInfo( - 'diff_license', None, license_url - ) - monkeypatch.setattr( - image.licenses, - 'get_license_info', - mock_license_chooser - ) - - actual_image = image_store._get_image( - license_url='https://license/url', - license_='license', - license_version='1.5', - foreign_landing_url=None, - image_url=None, - thumbnail_url=None, - foreign_identifier=None, - width=None, - height=None, - creator=None, - creator_url=None, - title=None, - meta_data=None, - raw_tags=None, - watermarked=None, - source=None, - ingestion_type='provider_api', - ) - assert actual_image.license_ == 'diff_license' - - -def test_ImageStore_get_image_gets_source( - monkeypatch, - setup_env, -): - image_store = image.ImageStore() - - def mock_get_source(source, provider): - return 'diff_source' - monkeypatch.setattr(image.util, 'get_source', mock_get_source) - - actual_image = image_store._get_image( - license_url='https://license/url', - license_='license', - license_version='1.5', - foreign_landing_url=None, - image_url=None, - thumbnail_url=None, - foreign_identifier=None, - width=None, - height=None, - creator=None, - creator_url=None, - title=None, - meta_data=None, - raw_tags=None, - watermarked=None, - source=None, - ingestion_type='provider_api', - ) - assert actual_image.source == 'diff_source' - - -def test_ImageStore_get_image_replaces_non_dict_meta_data_with_no_license_url( - setup_env, -): - image_store = image.ImageStore() - - actual_image = image_store._get_image( - license_url=None, - license_='license', - license_version='1.5', - foreign_landing_url=None, - image_url=None, - thumbnail_url=None, - foreign_identifier=None, - width=None, - height=None, - creator=None, - creator_url=None, - title=None, - meta_data='notadict', - raw_tags=None, - watermarked=None, - source=None, - ingestion_type='provider_api', - ) - assert actual_image.meta_data == { - 'license_url': None, 'raw_license_url': None - } - - -def test_ImageStore_get_image_creates_meta_data_with_valid_license_url( +def test_ImageStore_add_item_adds_valid_license_url_to_dict_meta_data( monkeypatch, setup_env ): - def mock_license_chooser(license_url, license_, license_version): - return image.licenses.LicenseInfo( - license_, license_version, license_url - ) - monkeypatch.setattr( - image.licenses, - 'get_license_info', - mock_license_chooser - ) - license_url = 'https://my.license.url' image_store = image.ImageStore() - actual_image = image_store._get_image( - license_url=license_url, - license_='license', - license_version='1.5', - foreign_landing_url=None, - image_url=None, - thumbnail_url=None, - foreign_identifier=None, - width=None, - height=None, - creator=None, - creator_url=None, - title=None, - meta_data=None, - raw_tags=None, - watermarked=None, - source=None, - ingestion_type='provider_api', - ) - assert actual_image.meta_data == { - 'license_url': license_url, 'raw_license_url': license_url - } - - -def test_ImageStore_get_image_adds_valid_license_url_to_dict_meta_data( - monkeypatch, setup_env -): - def mock_license_chooser(license_url, license_, license_version): - return image.licenses.LicenseInfo( - license_, license_version, license_url + def item_saver(arg): + pass + + with patch.object( + image_store, + 'save_item', + side_effect=item_saver) as mock_save: + image_store.add_item( + license_url='https://license/url', + license_='by', + license_version='4.0', + foreign_landing_url='', + image_url='', + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data={'key1': 'val1'}, + raw_tags=None, + watermarked=None, + source=None, + ingestion_type='provider_api', ) - monkeypatch.setattr( - image.licenses, - 'get_license_info', - mock_license_chooser - ) - image_store = image.ImageStore() + actual_image = mock_save.call_args[0][0] - actual_image = image_store._get_image( - license_url='https://license/url', - license_='license', - license_version='1.5', - foreign_landing_url=None, - image_url=None, - thumbnail_url=None, - foreign_identifier=None, - width=None, - height=None, - creator=None, - creator_url=None, - title=None, - meta_data={'key1': 'val1'}, - raw_tags=None, - watermarked=None, - source=None, - ingestion_type='provider_api', - ) - assert actual_image.meta_data == { - 'key1': 'val1', - 'license_url': 'https://license/url', - 'raw_license_url': 'https://license/url' - } - - -def test_ImageStore_get_image_fixes_invalid_license_url( - monkeypatch, setup_env -): - image_store = image.ImageStore() - original_url = 'https://license/url', - updated_url = 'https://updatedurl.com' - - def mock_license_chooser(license_url, license_, license_version): - return image.licenses.LicenseInfo( - license_, license_version, updated_url - ) - monkeypatch.setattr( - image.licenses, - 'get_license_info', - mock_license_chooser - ) - - actual_image = image_store._get_image( - license_url=original_url, - license_='license', - license_version='1.5', - foreign_landing_url=None, - image_url=None, - thumbnail_url=None, - foreign_identifier=None, - width=None, - height=None, - creator=None, - creator_url=None, - title=None, - meta_data={}, - raw_tags=None, - watermarked=None, - source=None, - ingestion_type='provider_api', - ) - assert actual_image.meta_data == { - 'license_url': updated_url, 'raw_license_url': original_url - } + assert actual_image.meta_data == { + 'key1': 'val1', + 'license_url': 'https://creativecommons.org/licenses/by/4.0/', + 'raw_license_url': 'https://license/url' + } def test_ImageStore_get_image_enriches_singleton_tags( @@ -461,8 +277,8 @@ def test_ImageStore_get_image_enriches_singleton_tags( actual_image = image_store._get_image( license_url='https://license/url', - license_='license', - license_version='1.5', + license_='by-sa', + license_version='4.0', foreign_landing_url=None, image_url=None, thumbnail_url=None, @@ -500,8 +316,8 @@ def test_ImageStore_get_image_tag_blacklist( actual_image = image_store._get_image( license_url='https://license/url', - license_='license', - license_version='1.5', + license_='by', + license_version='4.0', foreign_landing_url=None, image_url=None, thumbnail_url=None, @@ -529,8 +345,8 @@ def test_ImageStore_get_image_enriches_multiple_tags( image_store = image.ImageStore('test_provider') actual_image = image_store._get_image( license_url='https://license/url', - license_='license', - license_version='1.5', + license_='by', + license_version='4.0', foreign_landing_url=None, image_url=None, thumbnail_url=None, @@ -566,8 +382,8 @@ def test_ImageStore_get_image_leaves_preenriched_tags( actual_image = image_store._get_image( license_url='https://license/url', - license_='license', - license_version='1.5', + license_='by', + license_version='4.0', foreign_landing_url=None, image_url=None, thumbnail_url=None, @@ -595,8 +411,8 @@ def test_ImageStore_get_image_nones_nonlist_tags( actual_image = image_store._get_image( license_url='https://license/url', - license_='license', - license_version='1.5', + license_='by', + license_version='4.0', foreign_landing_url=None, image_url=None, thumbnail_url=None, diff --git a/src/cc_catalog_airflow/dags/common/storage/test_media.py b/src/cc_catalog_airflow/dags/common/storage/test_media.py new file mode 100644 index 000000000..70c45f557 --- /dev/null +++ b/src/cc_catalog_airflow/dags/common/storage/test_media.py @@ -0,0 +1,912 @@ +""" +MediaStore is an abstract class, so to test it we +use one of the inheriting classes, ImageStore +""" +import logging +from unittest.mock import patch + +import requests + +import pytest +import tldextract + +from common.licenses import licenses +from common.storage import image + +logging.basicConfig( + format='%(asctime)s - %(name)s - %(levelname)s: %(message)s', + level=logging.DEBUG) + +logger = logging.getLogger(__name__) + +# This avoids needing the internet for testing. +licenses.urls.tldextract.extract = tldextract.TLDExtract( + suffix_list_urls=None +) +image.columns.urls.tldextract.extract = tldextract.TLDExtract( + suffix_list_urls=None +) +IMAGE_COLUMN_NAMES = [x.NAME for x in image.IMAGE_TSV_COLUMNS] + + +@pytest.fixture +def setup_env(monkeypatch): + monkeypatch.setenv('OUTPUT_DIR', '/tmp') + + +@pytest.fixture +def mock_rewriter(monkeypatch): + def mock_rewrite_redirected_url(url_string): + return url_string + + monkeypatch.setattr( + licenses.urls, + 'rewrite_redirected_url', + mock_rewrite_redirected_url, + ) + + +@pytest.fixture +def get_good(monkeypatch): + def mock_get(url, timeout=60): + return requests.Response() + + monkeypatch.setattr(licenses.urls.requests, 'get', mock_get) + + +def test_MediaStore_uses_OUTPUT_DIR_variable( + monkeypatch, +): + testing_output_dir = '/my_output_dir' + monkeypatch.setenv('OUTPUT_DIR', testing_output_dir) + image_store = image.ImageStore() + assert testing_output_dir in image_store._OUTPUT_PATH + + +def test_MediaStore_falls_back_to_tmp_output_dir_variable( + monkeypatch, + setup_env, +): + monkeypatch.delenv('OUTPUT_DIR') + image_store = image.ImageStore() + assert '/tmp' in image_store._OUTPUT_PATH + + +def test_MediaStore_includes_provider_in_output_file_string( + setup_env, +): + image_store = image.ImageStore('test_provider') + assert type(image_store._OUTPUT_PATH) == str + assert 'test_provider' in image_store._OUTPUT_PATH + + +def test_MediaStore_includes_media_type_in_output_file_string( + setup_env, +): + image_store = image.ImageStore('test_provider') + assert type(image_store._OUTPUT_PATH) == str + assert 'image' in image_store._OUTPUT_PATH + + +def test_MediaStore_add_item_adds_realistic_image_to_buffer( + setup_env, mock_rewriter +): + license_url = 'https://creativecommons.org/publicdomain/zero/1.0/' + image_store = image.ImageStore(provider='testing_provider') + image_store.add_item( + foreign_landing_url='https://images.org/image01', + image_url='https://images.org/image01.jpg', + license_url=license_url, + ) + assert len(image_store._media_buffer) == 1 + + +def test_MediaStore_add_item_adds_multiple_images_to_buffer( + mock_rewriter, setup_env, +): + image_store = image.ImageStore(provider='testing_provider') + image_store.add_item( + foreign_landing_url='https://images.org/image01', + image_url='https://images.org/image01.jpg', + license_url='https://creativecommons.org/publicdomain/zero/1.0/' + ) + image_store.add_item( + foreign_landing_url='https://images.org/image02', + image_url='https://images.org/image02.jpg', + license_url='https://creativecommons.org/publicdomain/zero/1.0/' + ) + image_store.add_item( + foreign_landing_url='https://images.org/image03', + image_url='https://images.org/image03.jpg', + license_url='https://creativecommons.org/publicdomain/zero/1.0/' + ) + image_store.add_item( + foreign_landing_url='https://images.org/image04', + image_url='https://images.org/image04.jpg', + license_url='https://creativecommons.org/publicdomain/zero/1.0/' + ) + assert len(image_store._media_buffer) == 4 + + +def test_MediaStore_add_item_flushes_buffer( + mock_rewriter, setup_env, tmpdir, +): + output_file = 'testing.tsv' + tmp_directory = tmpdir + output_dir = str(tmp_directory) + tmp_file = tmp_directory.join(output_file) + tmp_path_full = str(tmp_file) + + image_store = image.ImageStore( + provider='testing_provider', + output_file=output_file, + output_dir=output_dir, + buffer_length=3 + ) + image_store.add_item( + foreign_landing_url='https://images.org/image01', + image_url='https://images.org/image01.jpg', + license_url='https://creativecommons.org/publicdomain/zero/1.0/' + ) + image_store.add_item( + foreign_landing_url='https://images.org/image02', + image_url='https://images.org/image02.jpg', + license_url='https://creativecommons.org/publicdomain/zero/1.0/' + ) + image_store.add_item( + foreign_landing_url='https://images.org/image03', + image_url='https://images.org/image03.jpg', + license_url='https://creativecommons.org/publicdomain/zero/1.0/' + ) + image_store.add_item( + foreign_landing_url='https://images.org/image04', + image_url='https://images.org/image04.jpg', + license_url='https://creativecommons.org/publicdomain/zero/1.0/' + ) + assert len(image_store._media_buffer) == 1 + with open(tmp_path_full) as f: + lines = f.read().split('\n') + assert len(lines) == 4 # recall the last '\n' will create an empty line. + + +def test_MediaStore_commit_writes_nothing_if_no_lines_in_buffer(): + image_store = image.ImageStore(output_dir='/path/does/not/exist') + image_store.commit() + + +def test_MediaStore_produces_correct_total_images(mock_rewriter, setup_env): + image_store = image.ImageStore(provider='testing_provider') + image_store.add_item( + foreign_landing_url='https://images.org/image01', + image_url='https://images.org/image01.jpg', + license_url='https://creativecommons.org/publicdomain/zero/1.0/' + ) + image_store.add_item( + foreign_landing_url='https://images.org/image02', + image_url='https://images.org/image02.jpg', + license_url='https://creativecommons.org/publicdomain/zero/1.0/' + ) + image_store.add_item( + foreign_landing_url='https://images.org/image03', + image_url='https://images.org/image03.jpg', + license_url='https://creativecommons.org/publicdomain/zero/1.0/' + ) + assert image_store.total_items == 3 + + +def test_MediaStore_get_valid_license_info_returns_None_when_license_is_None( + monkeypatch, + setup_env, +): + image_store = image.ImageStore() + + actual_image_data = image_store.validate_license_info({ + 'license_url': 'https://license/url', + 'license_': None, + 'license_version': '1.5', + 'foreign_landing_url': '', + 'image_url': '', + }) + assert actual_image_data is None + + +def test_MediaStore_get_valid_license_info_returns_None_when_license_is_invalid( + monkeypatch, + setup_env, +): + image_store = image.ImageStore() + + actual_license_info = image_store.validate_license_info({ + 'license_url': 'https://license/url', + 'license_': 'license', + 'license_version': '1.5', + 'foreign_landing_url': '', + 'image_url': '', + }) + assert actual_license_info is None + + +def test_MediaStore_clean_media_metadata_does_not_change_required_media_arguments( + monkeypatch, + setup_env, +): + image_url = 'test_url' + foreign_landing_url = 'foreign_landing_url' + image_store = image.ImageStore() + image_data = { + 'license_': 'by', + 'license_version': '4.0', + 'foreign_landing_url': foreign_landing_url, + 'image_url': image_url, + 'thumbnail_url': None, + 'foreign_identifier': None, + } + cleaned_data = image_store.clean_media_metadata(**image_data) + + assert cleaned_data['image_url'] == image_url + assert cleaned_data['foreign_landing_url'] == foreign_landing_url + + +def test_MediaStore_clean_media_metadata_adds_provider( + monkeypatch, + setup_env, +): + provider = 'test_provider' + image_store = image.ImageStore(provider=provider) + image_data = { + 'license_': 'by', + 'license_version': '4.0', + 'foreign_landing_url': None, + 'image_url': None, + } + cleaned_data = image_store.clean_media_metadata(**image_data) + + assert cleaned_data['provider'] == provider + + +def test_MediaStore_clean_media_metadata_adds_filesize( + monkeypatch, + setup_env, +): + image_store = image.ImageStore() + image_data = { + 'license_': 'by', + 'license_version': '4.0', + } + cleaned_data = image_store.clean_media_metadata(**image_data) + + assert 'filesize' in cleaned_data + assert cleaned_data['filesize'] is None + + +def test_MediaStore_clean_media_metadata_removes_license_urls( + monkeypatch, + setup_env, +): + image_store = image.ImageStore() + image_data = { + 'license_': 'by-nc-nd', + 'license_version': '4.0', + 'license_url': 'license', + 'foreign_landing_url': None, + 'image_url': None, + 'thumbnail_url': None, + 'foreign_identifier': None, + } + cleaned_data = image_store.clean_media_metadata(**image_data) + + assert 'license_url' not in cleaned_data + assert 'raw_license_url' not in cleaned_data + + +def test_MediaStore_clean_media_metadata_replaces_license_url_with_license_info( + monkeypatch, + setup_env, +): + license_url = 'https://creativecommons.org/licenses/by-nc-nd/4.0/' + image_store = image.ImageStore() + image_data = { + 'license_url': license_url, + 'license_': None, + 'license_version': None, + } + cleaned_data = image_store.clean_media_metadata(**image_data) + + expected_license = 'by-nc-nd' + expected_version = '4.0' + assert cleaned_data['license_'] == expected_license + assert cleaned_data['license_version'] == expected_version + assert 'license_url' not in cleaned_data + + +def test_MediaStore_clean_media_metadata_adds_license_urls_to_meta_data( + monkeypatch, + setup_env, +): + raw_license_url = 'raw_license' + license_url = 'https://creativecommons.org/licenses/by-nc-nd/4.0/' + image_store = image.ImageStore() + image_data = { + 'license_': 'by-nc-nd', + 'license_version': '4.0', + 'license_url': raw_license_url, + 'foreign_landing_url': None, + 'image_url': None, + 'thumbnail_url': None, + 'foreign_identifier': None, + 'ingestion_type': 'provider_api' + } + cleaned_data = image_store.clean_media_metadata(**image_data) + + assert cleaned_data['meta_data']['license_url'] == license_url + assert cleaned_data['meta_data']['raw_license_url'] == raw_license_url + + +def test_MediaStore_get_image_gets_source( + monkeypatch, + setup_env, +): + image_store = image.ImageStore() + + actual_image = image_store._get_image( + license_='by', + license_version='4.0', + foreign_landing_url=None, + image_url=None, + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data=None, + raw_tags=None, + watermarked=None, + source='diff_source', + ingestion_type=None, + ) + assert actual_image.source == 'diff_source' + + +def test_MediaStore_sets_source_to_provider_if_source_is_none( + monkeypatch, + setup_env, +): + image_store = image.ImageStore(provider='test_provider') + + actual_image = image_store._get_image( + license_='by', + license_version='4.0', + foreign_landing_url=None, + image_url=None, + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data=None, + raw_tags=None, + watermarked=None, + source=None, + ingestion_type=None, + ) + assert actual_image.source == 'test_provider' + + +def test_MediaStore_add_image_replaces_non_dict_meta_data_with_no_license_url( + setup_env, +): + image_store = image.ImageStore() + + def item_saver(arg): + pass + + with patch.object( + image_store, + 'save_item', + side_effect=item_saver) as mock_save: + image_store.add_item( + license_url=None, + license_='by-nc-nd', + license_version='4.0', + foreign_landing_url='', + image_url='', + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data='notadict', + raw_tags=None, + watermarked=None, + source=None, + ingestion_type=None, + ) + actual_image = mock_save.call_args[0][0] + assert actual_image.meta_data == { + 'license_url': 'https://creativecommons.org/licenses/by-nc-nd/4.0/', + 'raw_license_url': None, + } + + +def test_MediaStore_add_item_creates_meta_data_with_valid_license_url( + monkeypatch, setup_env +): + image_store = image.ImageStore() + + license_url = "https://my.license.url" + valid_license_url = 'https://creativecommons.org/licenses/by/4.0/' + + def item_saver(arg): + pass + + with patch.object( + image_store, + 'save_item', + side_effect=item_saver) as mock_save: + image_store.add_item( + license_url=license_url, + license_='by', + license_version='4.0', + foreign_landing_url='', + image_url='', + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data=None, + raw_tags=None, + watermarked=None, + source=None, + ingestion_type=None, + ) + actual_image = mock_save.call_args[0][0] + + assert actual_image.meta_data == { + 'license_url': valid_license_url, + 'raw_license_url': license_url + } + + +def test_MediaStore_add_item_adds_valid_license_url_to_dict_meta_data( + monkeypatch, setup_env +): + image_store = image.ImageStore() + + license_url = "https://my.license.url" + valid_license_url = 'https://creativecommons.org/licenses/by/4.0/' + + def item_saver(arg): + pass + + with patch.object( + image_store, + 'save_item', + side_effect=item_saver) as mock_save: + image_store.add_item( + license_url=license_url, + license_='by', + license_version='4.0', + foreign_landing_url='', + image_url='', + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data={'key1': 'val1'}, + raw_tags=None, + watermarked=None, + source=None, + ingestion_type=None, + ) + actual_image = mock_save.call_args[0][0] + + assert actual_image.meta_data == { + 'key1': 'val1', + 'license_url': valid_license_url, + 'raw_license_url': license_url + } + + +def test_ImageStore_add_item_fixes_invalid_license_url( + monkeypatch, setup_env +): + image_store = image.ImageStore() + + original_url = "https://license/url" + updated_url = 'https://creativecommons.org/licenses/by-nc-sa/2.0/' + + def item_saver(arg): + pass + + with patch.object( + image_store, + 'save_item', + side_effect=item_saver) as mock_save: + image_store.add_item( + license_url=original_url, + license_='by-nc-sa', + license_version='2.0', + foreign_landing_url='', + image_url='', + meta_data={}, + ) + actual_image = mock_save.call_args[0][0] + + assert actual_image.meta_data == { + 'license_url': updated_url, 'raw_license_url': original_url + } + + +def test_MediaStore_get_image_enriches_singleton_tags( + setup_env, +): + image_store = image.ImageStore('test_provider') + + actual_image = image_store._get_image( + license_url='https://license/url', + license_='by-sa', + license_version='4.0', + foreign_landing_url=None, + image_url=None, + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data=None, + raw_tags=['lone'], + watermarked=None, + source=None, + ingestion_type=None, + ) + + assert actual_image.tags == [{'name': 'lone', 'provider': 'test_provider'}] + + +def test_MediaStore_get_image_tag_blacklist( + setup_env, +): + raw_tags = [ + 'cc0', + 'valid', + 'garbage:=metacrap', + 'uploaded:by=flickrmobile', + { + 'name': 'uploaded:by=instagram', + 'provider': 'test_provider' + } + ] + + image_store = image.ImageStore('test_provider') + + actual_image = image_store._get_image( + license_='by', + license_version='4.0', + foreign_landing_url=None, + image_url=None, + meta_data=None, + raw_tags=raw_tags, + foreign_identifier=None, + thumbnail_url=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + watermarked=None, + ingestion_type=None, + ) + + assert actual_image.tags == [ + {'name': 'valid', 'provider': 'test_provider'} + ] + + +def test_MediaStore_get_image_enriches_multiple_tags( + setup_env, +): + image_store = image.ImageStore('test_provider') + actual_image = image_store._get_image( + license_url='https://license/url', + license_='by', + license_version='4.0', + foreign_landing_url=None, + image_url=None, + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data=None, + raw_tags=['tagone', 'tag2', 'tag3'], + watermarked=None, + source=None, + ingestion_type=None, + ) + + assert actual_image.tags == [ + {'name': 'tagone', 'provider': 'test_provider'}, + {'name': 'tag2', 'provider': 'test_provider'}, + {'name': 'tag3', 'provider': 'test_provider'}, + ] + + +def test_ImageStore_get_image_leaves_preenriched_tags( + setup_env +): + image_store = image.ImageStore('test_provider') + tags = [ + {'name': 'tagone', 'provider': 'test_provider'}, + {'name': 'tag2', 'provider': 'test_provider'}, + {'name': 'tag3', 'provider': 'test_provider'}, + ] + + actual_image = image_store._get_image( + license_url='https://license/url', + license_='by', + license_version='4.0', + foreign_landing_url=None, + image_url=None, + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data=None, + raw_tags=tags, + watermarked=None, + source=None, + ingestion_type=None, + ) + + assert actual_image.tags == tags + + +def test_ImageStore_get_image_nones_nonlist_tags( + setup_env, +): + image_store = image.ImageStore('test_provider') + tags = 'notalist' + + actual_image = image_store._get_image( + license_url='https://license/url', + license_='by', + license_version='4.0', + foreign_landing_url=None, + image_url=None, + thumbnail_url=None, + foreign_identifier=None, + width=None, + height=None, + creator=None, + creator_url=None, + title=None, + meta_data=None, + raw_tags=tags, + watermarked=None, + source=None, + ingestion_type=None, + ) + + assert actual_image.tags is None + + +@pytest.fixture +def default_image_args( + setup_env, +): + return dict( + foreign_identifier=None, + foreign_landing_url='https://image.org', + image_url='https://image.org', + thumbnail_url=None, + width=None, + height=None, + filesize=None, + license_='cc0', + license_version='1.0', + creator=None, + creator_url=None, + title=None, + meta_data=None, + tags=None, + watermarked=None, + provider=None, + source=None, + ingestion_type=None, + ) + + +def test_create_tsv_row_non_none_if_req_fields( + default_image_args, + get_good, + setup_env, +): + image_store = image.ImageStore() + test_image = image.Image(**default_image_args) + actual_row = image_store._create_tsv_row(test_image) + assert actual_row is not None + + +def test_create_tsv_row_none_if_no_foreign_landing_url( + default_image_args, + setup_env, +): + image_store = image.ImageStore() + image_args = default_image_args + image_args['foreign_landing_url'] = None + test_image = image.Image(**image_args) + expect_row = None + actual_row = image_store._create_tsv_row(test_image) + assert expect_row == actual_row + + +def test_create_tsv_row_none_if_no_license( + default_image_args, + setup_env, +): + image_store = image.ImageStore() + image_args = default_image_args + image_args['license_'] = None + test_image = image.Image(**image_args) + expect_row = None + actual_row = image_store._create_tsv_row(test_image) + assert expect_row == actual_row + + +def test_create_tsv_row_none_if_no_license_version( + default_image_args, + setup_env, +): + image_store = image.ImageStore() + image_args = default_image_args + image_args['license_version'] = None + test_image = image.Image(**image_args) + expect_row = None + actual_row = image_store._create_tsv_row(test_image) + assert expect_row == actual_row + + +def test_create_tsv_row_returns_none_if_missing_image_url( + default_image_args, + setup_env, +): + image_store = image.ImageStore() + image_args = default_image_args + image_args['image_url'] = None + test_image = image.Image(**image_args) + expect_row = None + actual_row = image_store._create_tsv_row(test_image) + assert expect_row == actual_row + + +def test_create_tsv_row_handles_empty_dict_and_tags( + default_image_args, + setup_env, +): + image_store = image.ImageStore() + meta_data = {} + tags = [] + image_args = default_image_args + image_args['meta_data'] = meta_data + image_args['tags'] = tags + test_image = image.Image(**image_args) + + actual_row = image_store._create_tsv_row(test_image).split('\t') + meta_data_col_id = IMAGE_COLUMN_NAMES.index('meta_data') + tags_col_id = IMAGE_COLUMN_NAMES.index('tags') + actual_meta_data, actual_tags = ( + actual_row[meta_data_col_id], actual_row[tags_col_id] + ) + expect_meta_data, expect_tags = '\\N', '\\N' + assert expect_meta_data == actual_meta_data + assert expect_tags == actual_tags + + +def test_create_tsv_row_turns_empty_into_nullchar( + default_image_args, + setup_env, +): + """ + Null values are converted into `N/A` in tsv files + This test first selects all the media properties with value None, + and then checks if all corresponding tsv values are `N/A`. + The last element has a new line at the end, so we check it separately + """ + image_store = image.ImageStore() + image_args = default_image_args + test_image = image.Image(**image_args) + + none_fields = [i for i, x in enumerate(test_image._fields) + if getattr(test_image, x) is None] + # none_field_names = [test_image._fields[x] for x in none_fields] + + actual_row = image_store._create_tsv_row(test_image).split('\t') + assert actual_row[-1] == '\\N\n' + + actual_row[-1] = '\\N' + assert all( + [ + actual_row[i] == '\\N' + for i in none_fields + ] + ) is True + + +def test_create_tsv_row_properly_places_entries( + setup_env, monkeypatch +): + def mock_validate_url(url_string): + return url_string + + monkeypatch.setattr( + image.columns.urls, 'validate_url_string', mock_validate_url + ) + image_store = image.ImageStore() + req_args_dict = { + 'foreign_landing_url': 'https://landing_page.com', + 'image_url': 'http://imageurl.com', + 'license_': 'testlicense', + 'license_version': '1.0', + } + args_dict = { + 'foreign_identifier': 'foreign_id', + 'thumbnail_url': 'http://thumbnail.com', + 'width': 200, + 'height': 500, + 'filesize': None, + 'creator': 'tyler', + 'creator_url': 'https://creatorurl.com', + 'title': 'agreatpicture', + 'meta_data': {'description': 'cat picture'}, + 'tags': [{'name': 'tag1', 'provider': 'testing'}], + 'watermarked': 'f', + 'provider': 'testing_provider', + 'source': 'testing_source', + 'ingestion_type': 'testing_ingestion', + } + args_dict.update(req_args_dict) + + test_image = image.Image(**args_dict) + actual_row = image_store._create_tsv_row( + test_image + ) + expect_row = '\t'.join([ + 'foreign_id', + 'https://landing_page.com', + 'http://imageurl.com', + 'http://thumbnail.com', + '200', + '500', + '\\N', + 'testlicense', + '1.0', + 'tyler', + 'https://creatorurl.com', + 'agreatpicture', + '{"description": "cat picture"}', + '[{"name": "tag1", "provider": "testing"}]', + 'f', + 'testing_provider', + 'testing_source', + 'testing_ingestion', + ]) + '\n' + assert expect_row == actual_row diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/brooklyn_museum.py b/src/cc_catalog_airflow/dags/provider_api_scripts/brooklyn_museum.py index 9514359a0..403433ada 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/brooklyn_museum.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/brooklyn_museum.py @@ -45,12 +45,12 @@ def main(): logger.debug(len(objects_batch)) if type(objects_batch) == list and len(objects_batch) > 0: _process_objects_batch(objects_batch) - logger.debug(f"Images till now {image_store.total_images}") + logger.debug(f"Images till now {image_store.total_items}") offset += LIMIT else: condition = False image_store.commit() - logger.info(f"Total images recieved {image_store.total_images}") + logger.info(f"Total images received {image_store.total_items}") def _get_query_param( diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/cleveland_museum_of_art.py b/src/cc_catalog_airflow/dags/provider_api_scripts/cleveland_museum_of_art.py index 96bd9f62d..25e7d23d9 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/cleveland_museum_of_art.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/cleveland_museum_of_art.py @@ -122,17 +122,17 @@ def _handle_response(batch): creator_name = '' total_images = image_store.add_item( - foreign_landing_url=foreign_landing_url, - image_url=image_url, - license_=license_, - license_version=license_version, - foreign_identifier=foreign_id, - width=width, - height=height, - title=title, - creator=creator_name, - meta_data=metadata, - ) + foreign_landing_url=foreign_landing_url, + image_url=image_url, + license_=license_, + license_version=license_version, + foreign_identifier=foreign_id, + width=width, + height=height, + title=title, + creator=creator_name, + meta_data=metadata, + ) return total_images diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py b/src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py index c9d171b48..2bd35208d 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py @@ -56,12 +56,12 @@ def main(): results = request_response.get("result") if type(results) == list and len(results) > 0: _handle_results(results) - logger.info(f"{image_store.total_images} images till now") + logger.info(f"{image_store.total_items} images till now") page = page + 1 else: condition = False image_store.commit() - logger.info(f"total images {image_store.total_images}") + logger.info(f"total images {image_store.total_items}") def _get_query_param( diff --git a/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py b/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py index 0cd904861..862114047 100644 --- a/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py +++ b/src/cc_catalog_airflow/dags/provider_api_scripts/wikimedia_commons.py @@ -90,13 +90,13 @@ def main(date): image_pages = _get_image_pages(image_batch) if image_pages: _process_image_pages(image_pages) - total_images = image_store.total_images + total_images = image_store.total_items logger.info(f'Total Images so far: {total_images}') if not continue_token: break image_store.commit() - total_images = image_store.total_images + total_images = image_store.total_items logger.info(f'Total images: {total_images}') logger.info('Terminated!') diff --git a/src/cc_catalog_airflow/dags/util/loader/ingestion_column.py b/src/cc_catalog_airflow/dags/util/loader/ingestion_column.py index 2e6fe74c8..863b0629b 100644 --- a/src/cc_catalog_airflow/dags/util/loader/ingestion_column.py +++ b/src/cc_catalog_airflow/dags/util/loader/ingestion_column.py @@ -6,7 +6,7 @@ import logging import os -from common.storage.image import _IMAGE_TSV_COLUMNS +from common.storage.image import IMAGE_TSV_COLUMNS logger = logging.getLogger(__name__) @@ -25,7 +25,7 @@ def check_and_fix_tsv_file(tsv_file_name): # If no media file is set in the filename, it is # probably image media_type = 'image' - old_cols_number = len(_IMAGE_TSV_COLUMNS) - 1 + old_cols_number = len(IMAGE_TSV_COLUMNS) - 1 if media_type == 'audio': # TODO: when audio is added: # old_cols_number = len(AUDIO_TSV_COLUMNS) - 1 diff --git a/src/cc_catalog_airflow/dags/util/pg_cleaner.py b/src/cc_catalog_airflow/dags/util/pg_cleaner.py index ca85f211e..50410479d 100644 --- a/src/cc_catalog_airflow/dags/util/pg_cleaner.py +++ b/src/cc_catalog_airflow/dags/util/pg_cleaner.py @@ -197,7 +197,7 @@ def _select_records(postgres_conn_id, prefix, image_table=IMAGE_TABLE_NAME): def _clean_single_row(record, image_store_dict, prefix): dirty_row = ImageTableRow(*record) image_store = image_store_dict[(dirty_row.provider, prefix)] - total_images_before = image_store.total_images + total_images_before = image_store.total_items license_lower = dirty_row.license_.lower() if dirty_row.license_ else None tags_list = [t for t in dirty_row.tags if t] if dirty_row.tags else None image_store.add_item( @@ -218,7 +218,7 @@ def _clean_single_row(record, image_store_dict, prefix): watermarked=dirty_row.watermarked, source=dirty_row.source, ) - if not image_store.total_images - total_images_before == 1: + if not image_store.total_items - total_images_before == 1: logger.warning(f"Record {dirty_row} was not stored!") _save_failure_identifier(dirty_row.identifier) @@ -233,7 +233,7 @@ def _save_failure_identifier(identifier, output_path=OUTPUT_PATH): def _log_and_check_totals(total_rows, image_store_dict): - image_totals = {k: v.total_images for k, v in image_store_dict.items()} + image_totals = {k: v.total_items for k, v in image_store_dict.items()} total_images_sum = sum(image_totals.values()) logger.info(f"Total images cleaned: {total_images_sum}") logger.info(f"Image Totals breakdown: {image_totals}") diff --git a/src/cc_catalog_airflow/dags/util/test_pg_cleaner.py b/src/cc_catalog_airflow/dags/util/test_pg_cleaner.py index 93f8f2bbf..d9d76f6b7 100644 --- a/src/cc_catalog_airflow/dags/util/test_pg_cleaner.py +++ b/src/cc_catalog_airflow/dags/util/test_pg_cleaner.py @@ -1222,7 +1222,7 @@ def test_clean_single_row_doesnt_reuse_wrong_image_store_and_adds_row(): def test_log_and_check_totals_raises_when_number_of_images_cleaned_is_wrong( monkeypatch, ): - monkeypatch.setattr(pg_cleaner.image.ImageStore, "total_images", 1) + monkeypatch.setattr(pg_cleaner.image.ImageStore, "total_items", 1) expected_calls = [ call.info("Total images cleaned: 2"), call.info( @@ -1243,7 +1243,7 @@ def test_log_and_check_totals_raises_when_number_of_images_cleaned_is_wrong( def test_log_and_check_totals_logs(monkeypatch): - monkeypatch.setattr(pg_cleaner.image.ImageStore, "total_images", 1) + monkeypatch.setattr(pg_cleaner.image.ImageStore, "total_items", 1) expected_calls = [ call.info("Total images cleaned: 2"), call.info(