WordPress · rwidom · Jan 16, 2023 · Oct 10, 2022 · Oct 10, 2022 · Oct 10, 2022
@@ -13,26 +13,27 @@
 
 
 # Default provider names
-FLICKR_DEFAULT_PROVIDER = "flickr"
-EUROPEANA_DEFAULT_PROVIDER = "europeana"
-WIKIMEDIA_AUDIO_PROVIDER = "wikimedia_audio"
-WIKIMEDIA_DEFAULT_PROVIDER = "wikimedia"
-SMITHSONIAN_DEFAULT_PROVIDER = "smithsonian"
 BROOKLYN_DEFAULT_PROVIDER = "brooklynmuseum"
 CLEVELAND_DEFAULT_PROVIDER = "clevelandmuseum"
+EUROPEANA_DEFAULT_PROVIDER = "europeana"
+FINNISH_DEFAULT_PROVIDER = "finnishmuseums"
+FLICKR_DEFAULT_PROVIDER = "flickr"
+FREESOUND_DEFAULT_PROVIDER = "freesound"
+INATURALIST_DEFAULT_PROVIDER = "inaturalist"
+JAMENDO_DEFAULT_PROVIDER = "jamendo"
 METROPOLITAN_MUSEUM_DEFAULT_PROVIDER = "met"
-VICTORIA_DEFAULT_PROVIDER = "museumsvictoria"
+NAPPY_DEFAULT_PROVIDER = "nappy"
 NYPL_DEFAULT_PROVIDER = "nypl"
 RAWPIXEL_DEFAULT_PROVIDER = "rawpixel"
 SCIENCE_DEFAULT_PROVIDER = "sciencemuseum"
+SMITHSONIAN_DEFAULT_PROVIDER = "smithsonian"
 SMK_DEFAULT_PROVIDER = "smk"
-WALTERS_DEFAULT_PROVIDER = "waltersartmuseum"
-FINNISH_DEFAULT_PROVIDER = "finnishmuseums"
-JAMENDO_DEFAULT_PROVIDER = "jamendo"
 STOCKSNAP_DEFAULT_PROVIDER = "stocksnap"
+VICTORIA_DEFAULT_PROVIDER = "museumsvictoria"
+WALTERS_DEFAULT_PROVIDER = "waltersartmuseum"
+WIKIMEDIA_AUDIO_PROVIDER = "wikimedia_audio"
+WIKIMEDIA_DEFAULT_PROVIDER = "wikimedia"
 WORDPRESS_DEFAULT_PROVIDER = "wordpress"
-FREESOUND_DEFAULT_PROVIDER = "freesound"
-INATURALIST_DEFAULT_PROVIDER = "inaturalist"
 
 # Finnish parameters
 FINNISH_SUB_PROVIDERS = {
@@ -136,6 +137,7 @@ class ImageCategory(Enum):
     "mccordmuseum": ImageCategory.DIGITIZED_ARTWORK.value,
     "met": ImageCategory.DIGITIZED_ARTWORK.value,
     "museumsvictoria": ImageCategory.DIGITIZED_ARTWORK.value,
+    "nappy": ImageCategory.PHOTOGRAPH.value,
     "phylopic": ImageCategory.ILLUSTRATION.value,
     "rijksmuseum": ImageCategory.DIGITIZED_ARTWORK.value,
     "sciencemuseum": ImageCategory.PHOTOGRAPH.value,

@@ -0,0 +1,111 @@
+"""
+Content Provider:       Nappy
+
+ETL Process:            Use the API to identify all CC0-licensed images.
+
+Output:                 TSV file containing the image meta-data.
+
+Notes:                  This API was written specifically for Openverse.
+                        There are no known limits or restrictions.
+
+"""
+import logging
+
+from common import constants
+from common.licenses import get_license_info
+from common.loader import provider_details as prov
+from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester
+
+
+logger = logging.getLogger(__name__)
+
+
+class NappyDataIngester(ProviderDataIngester):
+    """
+    Ingest image records from the Nappy API built for Openverse.
+
+    Pages through `endpoint` one page at a time; every record is treated as
+    CC0-licensed.
+    """
+
+    providers = {"image": prov.NAPPY_DEFAULT_PROVIDER}
+    endpoint = "https://api.nappy.co/v1/openverse/images"
+    # TODO The following are set to their default values. Remove them if the defaults
+    # are acceptable, or override them.
+    delay = 1
+    retries = 3
+    headers = {"Accept": "application/json"}
+
+    def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict:
+        """Return params for page 1 on the first request, else the next page."""
+        if not prev_query_params:
+            return {"page": 1}
+        return {**prev_query_params, "page": prev_query_params["page"] + 1}
+
+    def get_batch_data(self, response_json):
+        """Return the list of records under "images", or None if no response."""
+        if response_json:
+            return response_json.get("images")
+        return None
+
+    def get_should_continue(self, response_json):
+        """Return True while the response advertises a truthy "next_page"."""
+        # Guard against an empty response, as `get_batch_data` does, rather than
+        # raising AttributeError on `None.get`.
+        return bool(response_json and response_json.get("next_page"))
+
+    def get_media_type(self, record: dict):
+        """Every Nappy record is an image."""
+        return constants.IMAGE
+
+    def get_record_data(self, data: dict) -> dict | list[dict] | None:
+        """
+        Map one API record onto the fields of an image record.
+
+        Returns None (skipping the record) when the landing url, the image url,
+        or the license info is unavailable.
+        """
+        # REQUIRED FIELDS
+        if not (foreign_landing_url := data.get("foreign_landing_url")):
+            return None
+
+        if not (image_url := data.get("url")):
+            return None
+
+        # Hardcoded to CC0, the only license Nappy.co uses
+        license_info = get_license_info(
+            "https://creativecommons.org/publicdomain/zero/1.0/"
+        )
+        if license_info is None:
+            return None
+
+        # OPTIONAL FIELDS
+        # `tags` is split on commas when present; it may be missing or empty.
+        tags = data.get("tags")
+
+        return {
+            "foreign_landing_url": foreign_landing_url,
+            "image_url": image_url,
+            "license_info": license_info,
+            "foreign_identifier": data.get("foreign_identifier"),
+            "thumbnail_url": f"{image_url}?auto=format&w=600&q=75",
+            "filesize": data.get("filesize"),
+            "filetype": data.get("filetype"),
+            "creator": data.get("creator"),
+            "creator_url": data.get("creator_url"),
+            "title": data.get("title"),
+            "meta_data": data.get("meta_data"),
+            "raw_tags": tags.split(",") if tags else None,
+        }
+
+
+def main():
+    """
+    Run Nappy ingestion directly from the CLI, without Airflow, for debugging.
+    """
+    logger.info("Begin: Nappy data ingestion")
+    NappyDataIngester().ingest_records()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py b/openverse_catalog/dags/providers/provider_api_scripts/provider_data_ingester.py
@@ -68,8 +68,20 @@ class ProviderDataIngester(ABC):
     @abstractmethod
     def providers(self) -> dict[str, str]:
         """
-        A dictionary whose keys are the supported `media_types`, and values are
-        the `provider` string in the `media` table of the DB for that type.
+        A dictionary mapping each supported media type to its corresponding
+        `provider` string (the string that will populate the `provider` field
+        in the Catalog DB). These strings should be defined as constants in
+        `common/loader/provider_details.py`.
+
+        By convention, when a provider supports multiple media types we set
+        separate provider strings for each type. For example:
+
+        ```
+        providers = {
+            "image": provider_details.MYPROVIDER_IMAGE_PROVIDER,
+            "audio": provider_details.MYPROVIDER_AUDIO_PROVIDER,
+        }
+        ```
         """
         pass
 
@@ -105,7 +117,7 @@ def __init__(self, conf: dict = None, date: str = None):
         self.delayed_requester = DelayedRequester(
             delay=self.delay, headers=self.headers
         )
-        self.media_stores = self.init_media_stores()
+        self.media_stores = self._init_media_stores()
         self.date = date
 
         # dag_run configuration options
@@ -126,7 +138,7 @@ def __init__(self, conf: dict = None, date: str = None):
             # Create a generator to facilitate fetching the next set of query_params.
             self.override_query_params = (qp for qp in query_params_list)
 
-    def init_media_stores(self) -> dict[str, MediaStore]:
+    def _init_media_stores(self) -> dict[str, MediaStore]:
         """
         Initialize a media store for each media type supported by this
         provider.
@@ -153,7 +165,7 @@ def ingest_records(self, **kwargs) -> None:
         logger.info(f"Begin ingestion for {self.__class__.__name__}")
 
         while should_continue:
-            query_params = self.get_query_params(query_params, **kwargs)
+            query_params = self._get_query_params(query_params, **kwargs)
             if query_params is None:
                 # Break out of ingestion if no query_params are supplied. This can
                 # happen when the final `override_query_params` is processed.
@@ -175,7 +187,7 @@ def ingest_records(self, **kwargs) -> None:
 
                 # If errors have already been caught during processing, raise them
                 # as well.
-                if error_summary := self.get_ingestion_errors():
+                if error_summary := self._get_ingestion_errors():
                     raise error_summary from error
                 raise
 
@@ -192,21 +204,21 @@ def ingest_records(self, **kwargs) -> None:
 
                 # Commit whatever records we were able to process, and rethrow the
                 # exception so the taskrun fails.
-                self.commit_records()
+                self._commit_records()
                 raise error from ingestion_error
 
             if self.limit and record_count >= self.limit:
                 logger.info(f"Ingestion limit of {self.limit} has been reached.")
                 should_continue = False
 
         # Commit whatever records we were able to process
-        self.commit_records()
+        self._commit_records()
 
         # If errors were caught during processing, raise them now
-        if error_summary := self.get_ingestion_errors():
+        if error_summary := self._get_ingestion_errors():
             raise error_summary
 
-    def get_ingestion_errors(self) -> AggregateIngestionError | None:
+    def _get_ingestion_errors(self) -> AggregateIngestionError | None:
         """
         If any errors were skipped during ingestion, log them as well as the
         associated query parameters. Then return an AggregateIngestionError.
@@ -235,10 +247,13 @@ def get_ingestion_errors(self) -> AggregateIngestionError | None:
             )
         return None
 
-    def get_query_params(self, prev_query_params: dict | None, **kwargs) -> dict | None:
+    def _get_query_params(
+        self, prev_query_params: dict | None, **kwargs
+    ) -> dict | None:
         """
         Returns the next set of query_params for the next request, handling
-        optional overrides via the dag_run conf.
+        optional overrides via the dag_run conf. This method should not be overridden;
+        instead override get_next_query_params.
         """
         # If we are getting query_params for the first batch and initial_query_params
         # have been set, return them.
@@ -391,7 +406,7 @@ def get_record_data(self, data: dict) -> dict | list[dict] | None:
         """
         pass
 
-    def commit_records(self) -> int:
+    def _commit_records(self) -> int:
         total = 0
         for store in self.media_stores.values():
             total += store.commit()

@@ -9,6 +9,7 @@
 from providers.provider_api_scripts.inaturalist import INaturalistDataIngester
 from providers.provider_api_scripts.metropolitan_museum import MetMuseumDataIngester
 from providers.provider_api_scripts.museum_victoria import VictoriaDataIngester
+from providers.provider_api_scripts.nappy import NappyDataIngester
 from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester
 from providers.provider_api_scripts.science_museum import ScienceMuseumDataIngester
 from providers.provider_api_scripts.smk import SmkDataIngester
@@ -168,6 +169,10 @@ def __post_init__(self):
         ingestion_callable=VictoriaDataIngester,
         start_date=datetime(2020, 1, 1),
     ),
+    ProviderWorkflow(
+        provider_script="nappy",
+        ingestion_callable=NappyDataIngester,
+    ),
     ProviderWorkflow(
         provider_script="nypl",
         start_date=datetime(2020, 1, 1),