This repository has been archived by the owner on Aug 4, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 54
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a Nappy provider DAG using ProviderDataIngester (#796)
* _-prefix methods that should not be overridden * Initial template * Add initial docs * Update template, add test template file * Add script to generate template files * Update docs to reference script * Moving more documentation into the code * Reformat docs - Breaks out into several files - Removes documentation that is redundant (copied from code) - Prefers documentation within the template - Explicitly documents advanced options as FAQ - Some small updates to the templating * Small tweaks * Remove unused 'license_url' from nappy and comment out unused test imports * Remove unused 'license_url' from nappy and comment out unused test imports * write small helper fn for filesizes * Add UA string header * move thumbnail_url to metadata for now * rename thumbnail_url metadata field to thumbnail * add dag start date * no header in next params & add thumbnail_url * add tests and test resources * remove questionable tag from test image * update docs * add popularity metrics to metadata * Add url to source docs Co-authored-by: Madison Swain-Bowden <[email protected]> * remove template comment from next query params Co-authored-by: Madison Swain-Bowden <[email protected]> * remove template comment on optional fields Co-authored-by: Madison Swain-Bowden <[email protected]> * remove template comment on get batch Co-authored-by: Madison Swain-Bowden <[email protected]> * remove template comment from main Co-authored-by: Madison Swain-Bowden <[email protected]> * remove template comment from get_record_data Co-authored-by: Madison Swain-Bowden <[email protected]> * pass batch_limit to the API Co-authored-by: Madison Swain-Bowden <[email protected]> * tests for batch limit API parameter * point to popularity metrics * template test directory fix * make license info a class variable * Remove outdated/duplicated template creation files * Update DAG documentation * fortify and test convert filesize Co-authored-by: Staci Cooper <[email protected]>
Co-authored-by: rwidom <[email protected]> Co-authored-by: rwidom <[email protected]> Co-authored-by: Madison Swain-Bowden <[email protected]>
- Loading branch information
1 parent
2a0a1c3
commit 709a466
Showing
9 changed files
with
509 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
121 changes: 121 additions & 0 deletions
121
openverse_catalog/dags/providers/provider_api_scripts/nappy.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
""" | ||
Content Provider: Nappy | ||
ETL Process: Use the API to identify all CC0-licensed images. | ||
Output: TSV file containing the image meta-data. | ||
Notes: This API was written specially for Openverse. | ||
There are no known limits or restrictions. | ||
https://nappy.co/ | ||
""" | ||
import logging | ||
|
||
from common import constants | ||
from common.licenses import get_license_info | ||
from common.loader import provider_details as prov | ||
from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class NappyDataIngester(ProviderDataIngester):
    """
    Ingest image records from the Nappy API.

    Requests are paged with ``page``/``per_page`` query parameters, and each
    item of a response's ``images`` list is mapped to one record dict by
    ``get_record_data``.
    """

    providers = {constants.IMAGE: prov.NAPPY_DEFAULT_PROVIDER}
    endpoint = "https://api.nappy.co/v1/openverse/images"
    headers = {"User-Agent": prov.UA_STRING, "Accept": "application/json"}

    # Hardcoded to CC0, the only license Nappy.co uses
    license_info = get_license_info(
        "https://creativecommons.org/publicdomain/zero/1.0/"
    )

    def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict:
        """
        Return the query parameters for the next request.

        The first request (no previous parameters) asks for page 1 with
        ``per_page`` set to ``self.batch_limit``. Every later request reuses
        the previous parameters with ``page`` incremented by one.
        """
        if not prev_query_params:
            return {
                "page": 1,
                "per_page": self.batch_limit,
            }
        return {
            **prev_query_params,
            "page": prev_query_params["page"] + 1,
        }

    def get_batch_data(self, response_json):
        """
        Return the ``images`` entry of the response, or ``None`` when the
        response is empty or missing.
        """
        if not response_json:
            return None
        return response_json.get("images")

    def get_should_continue(self, response_json):
        """
        Return ``True`` only while the response carries a truthy ``next_page``.

        An empty or missing response yields ``False`` rather than raising, so
        this method accepts the same inputs as ``get_batch_data``.
        """
        if not response_json:
            return False
        return bool(response_json.get("next_page"))

    def get_media_type(self, record: dict):
        """Return the media type of a record; this ingester only yields images."""
        return constants.IMAGE

    @staticmethod
    def _convert_filesize(raw_filesize_string: str) -> int | None:
        """
        Convert a size string to a whole number of bytes.

        For example, ``"187.8kB"`` becomes ``187800``. The recognized unit
        suffixes are ``kB``, ``MB`` and ``GB``, treated as decimal multiples.

        Returns ``None`` when the input is not a string, the unit suffix is
        not recognized, or the numeric part cannot be turned into a finite
        number.
        """
        FILETYPE_MULTIPLIERS = {"kB": 1000, "MB": 1_000_000, "GB": 1_000_000_000}
        if not isinstance(raw_filesize_string, str):
            return None

        stripped = raw_filesize_string.strip()
        amount, unit = stripped[:-2], stripped[-2:]
        multiplier = FILETYPE_MULTIPLIERS.get(unit)
        if multiplier is None:
            return None

        try:
            return round(float(amount) * multiplier)
        except (ValueError, OverflowError):
            # ValueError: the amount is empty, non-numeric, or NaN.
            # OverflowError: the amount is (or multiplies out to) infinity.
            return None

    def get_record_data(self, data: dict) -> dict | list[dict] | None:
        """
        Map one item of the API's ``images`` list to a record dict.

        Returns ``None`` (the item is skipped) when either the
        ``foreign_landing_url`` or the ``url`` field is absent.
        """
        if (foreign_landing_url := data.get("foreign_landing_url")) is None:
            return None

        if (image_url := data.get("url")) is None:
            return None

        # ``tags`` is a comma-separated string; tolerate it being absent or
        # null instead of failing on ``None.split``.
        tags = data.get("tags")
        raw_tags = tags.split(",") if isinstance(tags, str) else None

        return {
            "foreign_landing_url": foreign_landing_url,
            "image_url": image_url,
            "thumbnail_url": image_url + "?auto=format&w=600&q=75",
            "license_info": self.license_info,
            "foreign_identifier": data.get("foreign_identifier"),
            "filesize": self._convert_filesize(data.get("filesize")),
            "filetype": data.get("filetype"),
            "creator": data.get("creator"),
            "creator_url": data.get("creator_url"),
            "title": data.get("title"),
            # Popularity metrics reported by the API.
            "meta_data": {
                "views": data.get("views"),
                "saves": data.get("saves"),
                "downloads": data.get("downloads"),
            },
            "raw_tags": raw_tags,
            "width": data.get("width"),
            "height": data.get("height"),
        }
|
||
|
||
def main():
    """Log the start of the run, then ingest all Nappy records."""
    logger.info("Begin: Nappy data ingestion")
    NappyDataIngester().ingest_records()
|
||
|
||
# Run the ingestion only when this module is executed as a script.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.