diff --git a/DAGs.md b/DAGs.md index f53eb4196..3059d474b 100644 --- a/DAGs.md +++ b/DAGs.md @@ -67,6 +67,7 @@ The following are DAGs grouped by their primary tag: | [`jamendo_workflow`](#jamendo_workflow) | `@monthly` | `False` | audio | | [`metropolitan_museum_workflow`](#metropolitan_museum_workflow) | `@daily` | `True` | image | | `museum_victoria_workflow` | `@monthly` | `False` | image | +| [`nappy_workflow`](#nappy_workflow) | `@monthly` | `False` | image | | `nypl_workflow` | `@monthly` | `False` | image | | [`phylopic_workflow`](#phylopic_workflow) | `@daily` | `True` | image | | [`rawpixel_workflow`](#rawpixel_workflow) | `@monthly` | `False` | image | @@ -105,6 +106,7 @@ The following is documentation associated with each DAG (where available): 1. [`jamendo_workflow`](#jamendo_workflow) 1. [`metropolitan_museum_reingestion_workflow`](#metropolitan_museum_reingestion_workflow) 1. [`metropolitan_museum_workflow`](#metropolitan_museum_workflow) +1. [`nappy_workflow`](#nappy_workflow) 1. [`oauth2_authorization`](#oauth2_authorization) 1. [`oauth2_token_refresh`](#oauth2_token_refresh) 1. [`phylopic_reingestion_workflow`](#phylopic_reingestion_workflow) @@ -376,6 +378,17 @@ blocking during local development testing. connect with just date and license. https://collectionapi.metmuseum.org/public/collection/v1/search?isPublicDomain=true&metadataDate=2022-08-07 +## `nappy_workflow` + +Content Provider: Nappy + +ETL Process: Use the API to identify all CC0-licensed images. + +Output: TSV file containing the image meta-data. + +Notes: This api was written specially for Openverse. There are no known limits +or restrictions. 
https://nappy.co/ + ## `oauth2_authorization` ### OAuth Provider Authorization diff --git a/docker/local_postgres/0004_openledger_image_view.sql b/docker/local_postgres/0004_openledger_image_view.sql index d06d06148..20dc28f49 100644 --- a/docker/local_postgres/0004_openledger_image_view.sql +++ b/docker/local_postgres/0004_openledger_image_view.sql @@ -12,9 +12,10 @@ INSERT INTO public.image_popularity_metrics ( provider, metric, percentile ) VALUES ('flickr', 'views', 0.85), - ('wikimedia', 'global_usage_count', 0.85), + ('nappy', 'downloads', 0.85), + ('rawpixel', 'download_count', 0.85), ('stocksnap', 'downloads_raw', 0.85), - ('rawpixel', 'download_count', 0.85) + ('wikimedia', 'global_usage_count', 0.85) ; diff --git a/openverse_catalog/dags/common/loader/provider_details.py b/openverse_catalog/dags/common/loader/provider_details.py index 92c22a0ad..c942ec883 100644 --- a/openverse_catalog/dags/common/loader/provider_details.py +++ b/openverse_catalog/dags/common/loader/provider_details.py @@ -13,26 +13,27 @@ # Default provider names -FLICKR_DEFAULT_PROVIDER = "flickr" -EUROPEANA_DEFAULT_PROVIDER = "europeana" -WIKIMEDIA_AUDIO_PROVIDER = "wikimedia_audio" -WIKIMEDIA_DEFAULT_PROVIDER = "wikimedia" -SMITHSONIAN_DEFAULT_PROVIDER = "smithsonian" BROOKLYN_DEFAULT_PROVIDER = "brooklynmuseum" CLEVELAND_DEFAULT_PROVIDER = "clevelandmuseum" +EUROPEANA_DEFAULT_PROVIDER = "europeana" +FINNISH_DEFAULT_PROVIDER = "finnishmuseums" +FLICKR_DEFAULT_PROVIDER = "flickr" +FREESOUND_DEFAULT_PROVIDER = "freesound" +INATURALIST_DEFAULT_PROVIDER = "inaturalist" +JAMENDO_DEFAULT_PROVIDER = "jamendo" METROPOLITAN_MUSEUM_DEFAULT_PROVIDER = "met" -VICTORIA_DEFAULT_PROVIDER = "museumsvictoria" +NAPPY_DEFAULT_PROVIDER = "nappy" NYPL_DEFAULT_PROVIDER = "nypl" RAWPIXEL_DEFAULT_PROVIDER = "rawpixel" SCIENCE_DEFAULT_PROVIDER = "sciencemuseum" +SMITHSONIAN_DEFAULT_PROVIDER = "smithsonian" SMK_DEFAULT_PROVIDER = "smk" -WALTERS_DEFAULT_PROVIDER = "waltersartmuseum" -FINNISH_DEFAULT_PROVIDER = 
"finnishmuseums" -JAMENDO_DEFAULT_PROVIDER = "jamendo" STOCKSNAP_DEFAULT_PROVIDER = "stocksnap" +VICTORIA_DEFAULT_PROVIDER = "museumsvictoria" +WALTERS_DEFAULT_PROVIDER = "waltersartmuseum" +WIKIMEDIA_AUDIO_PROVIDER = "wikimedia_audio" +WIKIMEDIA_DEFAULT_PROVIDER = "wikimedia" WORDPRESS_DEFAULT_PROVIDER = "wordpress" -FREESOUND_DEFAULT_PROVIDER = "freesound" -INATURALIST_DEFAULT_PROVIDER = "inaturalist" PHYLOPIC_DEFAULT_PROVIDER = "phylopic" # Finnish parameters @@ -138,6 +139,7 @@ class ImageCategory(Enum): "mccordmuseum": ImageCategory.DIGITIZED_ARTWORK.value, "met": ImageCategory.DIGITIZED_ARTWORK.value, "museumsvictoria": ImageCategory.DIGITIZED_ARTWORK.value, + "nappy": ImageCategory.PHOTOGRAPH.value, "phylopic": ImageCategory.ILLUSTRATION.value, "rijksmuseum": ImageCategory.DIGITIZED_ARTWORK.value, "sciencemuseum": ImageCategory.PHOTOGRAPH.value, diff --git a/openverse_catalog/dags/common/popularity/sql.py b/openverse_catalog/dags/common/popularity/sql.py index a7c8d7a6e..aabd90065 100644 --- a/openverse_catalog/dags/common/popularity/sql.py +++ b/openverse_catalog/dags/common/popularity/sql.py @@ -43,9 +43,10 @@ IMAGE_POPULARITY_METRICS = { "flickr": {"metric": "views"}, - "wikimedia": {"metric": "global_usage_count"}, - "stocksnap": {"metric": "downloads_raw"}, + "nappy": {"metric": "downloads"}, "rawpixel": {"metric": "download_count"}, + "stocksnap": {"metric": "downloads_raw"}, + "wikimedia": {"metric": "global_usage_count"}, } AUDIO_POPULARITY_METRICS = { diff --git a/openverse_catalog/dags/providers/provider_api_scripts/nappy.py b/openverse_catalog/dags/providers/provider_api_scripts/nappy.py new file mode 100644 index 000000000..2aa427ead --- /dev/null +++ b/openverse_catalog/dags/providers/provider_api_scripts/nappy.py @@ -0,0 +1,121 @@ +""" +Content Provider: Nappy + +ETL Process: Use the API to identify all CC0-licensed images. + +Output: TSV file containing the image meta-data. + +Notes: This api was written specially for Openverse. 
+ There are no known limits or restrictions. + https://nappy.co/ + +""" +import logging + +from common import constants +from common.licenses import get_license_info +from common.loader import provider_details as prov +from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester + + +logger = logging.getLogger(__name__) + + +class NappyDataIngester(ProviderDataIngester): + providers = {constants.IMAGE: prov.NAPPY_DEFAULT_PROVIDER} + endpoint = "https://api.nappy.co/v1/openverse/images" + headers = {"User-Agent": prov.UA_STRING, "Accept": "application/json"} + + # Hardcoded to CC0, the only license Nappy.co uses + license_info = get_license_info( + "https://creativecommons.org/publicdomain/zero/1.0/" + ) + + def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict: + if not prev_query_params: + return { + "page": 1, + "per_page": self.batch_limit, + } + else: + return { + **prev_query_params, + "page": prev_query_params["page"] + 1, + } + + def get_batch_data(self, response_json): + if response_json: + return response_json.get("images") + return None + + def get_should_continue(self, response_json): + return bool(response_json.get("next_page")) + + def get_media_type(self, record: dict): + return constants.IMAGE + + @staticmethod + def _convert_filesize(raw_filesize_string: str) -> int: + """ + Convert sizes from strings to byte integers, e.g. "187.8kB" to 187800. 
+ """ + FILETYPE_MULTIPLIERS = {"kB": 1000, "MB": 1_000_000, "GB": 1_000_000_000} + if isinstance(raw_filesize_string, str) and len(raw_filesize_string) > 2: + stripped = raw_filesize_string.strip() + if stripped[-2:] in FILETYPE_MULTIPLIERS: + try: + units = float(stripped[:-2]) + except ValueError: + return + multiplier = FILETYPE_MULTIPLIERS[stripped[-2:]] + return round(units * multiplier) + + def get_record_data(self, data: dict) -> dict | list[dict] | None: + if (foreign_landing_url := data.get("foreign_landing_url")) is None: + return None + + if (image_url := data.get("url")) is None: + return None + + foreign_identifier = data.get("foreign_identifier") + thumbnail_url = data.get("url") + "?auto=format&w=600&q=75" + filesize = self._convert_filesize(data.get("filesize")) + filetype = data.get("filetype") + creator = data.get("creator") + creator_url = data.get("creator_url") + title = data.get("title") + meta_data = { + "views": data.get("views"), + "saves": data.get("saves"), + "downloads": data.get("downloads"), + } + raw_tags = data.get("tags").split(",") + width = data.get("width") + height = data.get("height") + + return { + "foreign_landing_url": foreign_landing_url, + "image_url": image_url, + "thumbnail_url": thumbnail_url, + "license_info": self.license_info, + "foreign_identifier": foreign_identifier, + "filesize": filesize, + "filetype": filetype, + "creator": creator, + "creator_url": creator_url, + "title": title, + "meta_data": meta_data, + "raw_tags": raw_tags, + "width": width, + "height": height, + } + + +def main(): + logger.info("Begin: Nappy data ingestion") + ingester = NappyDataIngester() + ingester.ingest_records() + + +if __name__ == "__main__": + main() diff --git a/openverse_catalog/dags/providers/provider_workflows.py b/openverse_catalog/dags/providers/provider_workflows.py index e668affef..b7f8302c4 100644 --- a/openverse_catalog/dags/providers/provider_workflows.py +++ b/openverse_catalog/dags/providers/provider_workflows.py @@ 
-13,6 +13,7 @@ from providers.provider_api_scripts.jamendo import JamendoDataIngester from providers.provider_api_scripts.metropolitan_museum import MetMuseumDataIngester from providers.provider_api_scripts.museum_victoria import VictoriaDataIngester +from providers.provider_api_scripts.nappy import NappyDataIngester from providers.provider_api_scripts.nypl import NyplDataIngester from providers.provider_api_scripts.phylopic import PhylopicDataIngester from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester @@ -160,6 +161,10 @@ def __post_init__(self): ingester_class=VictoriaDataIngester, start_date=datetime(2020, 1, 1), ), + ProviderWorkflow( + ingester_class=NappyDataIngester, + start_date=datetime(2022, 12, 1), + ), ProviderWorkflow( ingester_class=NyplDataIngester, start_date=datetime(2020, 1, 1), diff --git a/tests/dags/providers/provider_api_scripts/resources/nappy/images.json b/tests/dags/providers/provider_api_scripts/resources/nappy/images.json new file mode 100644 index 000000000..274f1789c --- /dev/null +++ b/tests/dags/providers/provider_api_scripts/resources/nappy/images.json @@ -0,0 +1,179 @@ +{ + "images": [ + { + "creator": "iamconnorrm", + "creator_url": "https://nappy.co/iamconnorrm", + "downloads": 1329, + "filesize": "233.5kB", + "filetype": "jpg", + "foreign_identifier": 9, + "foreign_landing_url": "https://nappy.co/photo/9/woman-with-tattoos", + "height": 1361, + "license": "CC0", + "saves": 18, + "tags": "indoor,bed,arthropod,dark,lobster,braids,female,red,blue,tattoo,earring,phone,laying,room", + "title": "woman with tattoos", + "url": "https://images.nappy.co/uploads/large/101591721349meykm7s6hvaswwvslpjrwibeyzru1fcxtxh0hf09cs7kdhmtptef4y3k4ua5z1bkyrbxov8tmagnafm8upwa3hxaxururtx7azaf.jpg", + "views": 82692, + "width": 2048 + }, + { + "creator": "iamconnorrm", + "creator_url": "https://nappy.co/iamconnorrm", + "downloads": 1371, + "filesize": "335.9kB", + "filetype": "jpg", + "foreign_identifier": 10, + 
"foreign_landing_url": "https://nappy.co/photo/10/woman-with-tattoos", + "height": 1568, + "license": "CC0", + "saves": 16, + "tags": "phone,thread,mobile phone,fiber,indoor,person,purple,braids,extensions,relaxed,female,woman,hoop earrings,tattoos,couch,laying,bedroom,home,chillen", + "title": "woman with tattoos", + "url": "https://images.nappy.co/uploads/large/1015917606841e4lhtkqwyaq2qo4rkrv2vnl30fss9ufdrknnxv26bivb8d36uk8qsdrwwh4bg4yrn6sjp23qulnomxnixlbaj1zqf5u5cpaqfw1.jpg", + "views": 50380, + "width": 2407 + }, + { + "creator": "NappyStock", + "creator_url": "https://nappy.co/NappyStock", + "downloads": 1082, + "filesize": "3.4MB", + "filetype": "jpg", + "foreign_identifier": 11, + "foreign_landing_url": "https://nappy.co/photo/11/women-drinking-coffee", + "height": 4480, + "license": "CC0", + "saves": 10, + "tags": "indoor,person,wall,human face,clothing,laptop,furniture,computer,woman,smile,sitting,table,living,people,happy,locs,haircut,female,women,excited,joyful,macbook,couch,coffee,tea,laughing,phone,working,office", + "title": "Women drinking coffee", + "url": "https://images.nappy.co/uploads/large/215917622316fvyx1szu6ntonn5qcmsj3jew8rctnhjmfbcehxwc19cegkluf6kjednm4goockgf9tzygdrktsbsted7fiizisslgyfpuzahenz.jpg", + "views": 26952, + "width": 6720 + }, + { + "creator": "michellclark", + "creator_url": "https://nappy.co/michellclark", + "downloads": 1670, + "filesize": "151.4kB", + "filetype": "jpg", + "foreign_identifier": 12, + "foreign_landing_url": "https://nappy.co/photo/12/man-smiling", + "height": 1620, + "license": "CC0", + "saves": 15, + "tags": "human face,person,clothing,man,smile,t-shirt,active shirt,top,facial hair,sleeve,baseball cap,black,headshot,hat,fitted,chain,dark ceasar,short cut,male,standing,portrait", + "title": "Man smiling", + "url": "https://images.nappy.co/uploads/large/71591899860fbb6p1q7ilg0n3utineujwjupktr7jgd5e8xpytrlt4exoefehfrswwtnli2ojd3uhnofd3703kd2f5rpldoqh1w79btgi8bqy8s.jpg", + "views": 69484, + "width": 1080 + }, + 
{ + "creator": "_willpower_", + "creator_url": "https://nappy.co/_willpower_", + "downloads": 864, + "filesize": "281.1kB", + "filetype": "jpg", + "foreign_identifier": 13, + "foreign_landing_url": "https://nappy.co/photo/13/bearded-man", + "height": 2048, + "license": "CC0", + "saves": 4, + "tags": "human face,outdoor,person,clothing,human beard,sky,man,fashion accessory,goggles,eyewear,moustache,cool,facial hair,cap,headgear,wearing,hat,sunglasses,jacket,male,teal,white,grey,park,field,green,chain,lips", + "title": "Bearded man", + "url": "https://images.nappy.co/uploads/large/21631695923fzirwe1wuypepmxtaxomi7kiog5a42uycntranjkmc8kmd5lsdsgaek4mwla71r36e5sungsxbj8znn78i6s42ktsmygvypnluto.jpg", + "views": 9494, + "width": 1365 + }, + { + "creator": "samanthasophia", + "creator_url": "https://nappy.co/samanthasophia", + "downloads": 1257, + "filesize": "308.4kB", + "filetype": "jpg", + "foreign_identifier": 14, + "foreign_landing_url": "https://nappy.co/photo/14/couple-on-a-date", + "height": 1365, + "license": "CC0", + "saves": 11, + "tags": "outdoor,sky,clothing,mountain,person,footwear,ground,curls,hat,relaxed,in love,couple,male,female,sneakers,shirt,bag,statue,standing,leaning,hollywood,los angeles,high ground,date,valentine", + "title": "Couple on a date", + "url": "https://images.nappy.co/uploads/large/121591921493qrrtjwlzvfpprfkttctplh4kd3oi82mdxxlmf96vjrads0sw2pmlfitcan3y429hwwjt9r5chftvmkbchgkgpdxb8yjole0pdq9m.jpg", + "views": 52293, + "width": 2048 + }, + { + "creator": "olueletu", + "creator_url": "https://nappy.co/olueletu", + "downloads": 1709, + "filesize": "272.1kB", + "filetype": "jpg", + "foreign_identifier": 32, + "foreign_landing_url": "https://nappy.co/photo/32/man-using-ipad", + "height": 1356, + "license": "CC0", + "saves": 18, + "tags": "person,clothing,human face,man,glasses,wall,shirt,indoor,mobile phone,text,striped,office,work,beard,male,blue,shoes,socks,pants,ipad,pencil,graph,sitting,teaching,presenting,meeting,stripe,carpet", + 
"title": "Man using iPad", + "url": "https://images.nappy.co/uploads/large/111591931716tsvidfocoz0nvs8p9fjukivldja0dkupitil7mpt3hzwsgkv3vbe92qo0lykowc3m9kxl42wc1qlvptrkcufzcm2h6onlmcioblb.jpg", + "views": 51200, + "width": 2048 + }, + { + "creator": "olueletu", + "creator_url": "https://nappy.co/olueletu", + "downloads": 1497, + "filesize": "220.2kB", + "filetype": "jpg", + "foreign_identifier": 33, + "foreign_landing_url": "https://nappy.co/photo/33/man-sketching-on-a-paper", + "height": 1356, + "license": "CC0", + "saves": 9, + "tags": "person,indoor,clothing,furniture,learning,table,bag,red,blue,wood,desk,watch,pen,paper,draw,standing,work from home,office,work,books,meeting,presentation", + "title": "Man sketching on a paper", + "url": "https://images.nappy.co/uploads/large/111591931801ilojwrb5h2axljslvhqbr5xktgn2oobqqob9kcfjcp4vvmdhvetgnkrryiogxvejl8mfsanchpy0tyenyaqakznqbqt6jqjxeea7.jpg", + "views": 45896, + "width": 2048 + }, + { + "creator": "olueletu", + "creator_url": "https://nappy.co/olueletu", + "downloads": 1012, + "filesize": "238.9kB", + "filetype": "jpg", + "foreign_identifier": 34, + "foreign_landing_url": "https://nappy.co/photo/34/man-looking-at-his-watch", + "height": 1603, + "license": "CC0", + "saves": 3, + "tags": "outdoor,sky,person,fashion accessory,cloud,grass,orange,watch,sweater,horizon,male,man,telling time,by the water,river", + "title": "Man looking at his watch", + "url": "https://images.nappy.co/uploads/large/111591931914waxh1ygklshcuat0vclcrxw5eqwlb4vxl5alwejvesekxat1d50p742iel6hh76dfzcasrtwgrz7surifwgl6sw9ngls4pvzq1nr.jpg", + "views": 17954, + "width": 2048 + }, + { + "creator": "olueletu", + "creator_url": "https://nappy.co/olueletu", + "downloads": 1119, + "filesize": "101.5kB", + "filetype": "jpg", + "foreign_identifier": 35, + "foreign_landing_url": "https://nappy.co/photo/35/man-playing-the-piano", + "height": 1356, + "license": "CC0", + "saves": 11, + "tags": "piano,person,musical keyboard,keyboard,keyboard 
player,clothing,electronic keyboard,electric piano,music,digital piano,electronic instrument,pianist,indoor,laptop,watch,shirt,purple,jazz,band,male,man,drums,studio", + "title": "Man playing the piano", + "url": "https://images.nappy.co/uploads/large/111591931994kkgbuerofxyi97kpps1rhubhp4lefwittst3zmhdrfjrfnfub3s94kfjyyxjnv0hjvli17giknovw6j79348blwktvl1n1i9zw5s.jpg", + "views": 20930, + "width": 2048 + } + ], + "next_page": "https://api.nappy.co/v1/openverse/images?page=2", + "page": 1, + "per_page": 10, + "total_pages": 206, + "total_results": 2059 +} diff --git a/tests/dags/providers/provider_api_scripts/resources/nappy/single_item.json b/tests/dags/providers/provider_api_scripts/resources/nappy/single_item.json new file mode 100644 index 000000000..873ea8bbe --- /dev/null +++ b/tests/dags/providers/provider_api_scripts/resources/nappy/single_item.json @@ -0,0 +1,17 @@ +{ + "creator": "iamconnorrm", + "creator_url": "https://nappy.co/iamconnorrm", + "downloads": 1329, + "filesize": "233.5kB", + "filetype": "jpg", + "foreign_identifier": 9, + "foreign_landing_url": "https://nappy.co/photo/9/woman-with-tattoos", + "height": 1361, + "license": "CC0", + "saves": 18, + "tags": "indoor,bed,arthropod,dark,lobster,braids,female,red,blue,tattoo,earring,phone,laying,room", + "title": "woman with tattoos", + "url": "https://images.nappy.co/uploads/large/101591721349meykm7s6hvaswwvslpjrwibeyzru1fcxtxh0hf09cs7kdhmtptef4y3k4ua5z1bkyrbxov8tmagnafm8upwa3hxaxururtx7azaf.jpg", + "views": 82692, + "width": 2048 +} diff --git a/tests/dags/providers/provider_api_scripts/test_nappy.py b/tests/dags/providers/provider_api_scripts/test_nappy.py new file mode 100644 index 000000000..6341d6e86 --- /dev/null +++ b/tests/dags/providers/provider_api_scripts/test_nappy.py @@ -0,0 +1,155 @@ +import json +from ast import literal_eval +from pathlib import Path + +import pytest +from common.constants import IMAGE +from common.licenses import get_license_info +from 
providers.provider_api_scripts.nappy import NappyDataIngester + + +# resource files +RESOURCES = Path(__file__).parent / "resources/nappy" +FULL_BATCH_RESPONSE = json.loads((RESOURCES / "images.json").read_text()) +SINGLE_ITEM = literal_eval((RESOURCES / "single_item.json").read_text()) + +# Set up test class +ingester = NappyDataIngester() + + +@pytest.mark.parametrize( + "previous, expected_result", + [ + pytest.param( + None, {"per_page": ingester.batch_limit, "page": 1}, id="default_response" + ), + pytest.param( + {"per_page": ingester.batch_limit, "page": 42}, + {"per_page": ingester.batch_limit, "page": 43}, + id="basic_increment", + ), + pytest.param( + {"thing1": "some", "thing2": "data", "page": 0}, + {"thing1": "some", "thing2": "data", "page": 1}, + id="other_parameters", + ), + ], +) +def test_get_next_query_params(previous, expected_result): + actual_result = ingester.get_next_query_params(previous) + assert actual_result == expected_result + + +# this is based on the assumption that Nappy will only ever send us image data +@pytest.mark.parametrize( + "record", + [None, {}, {"here is": "some data"}], +) +def test_get_media_type(record): + expected_result = IMAGE + actual_result = ingester.get_media_type(record) + assert actual_result == expected_result + + +@pytest.mark.parametrize( + "response_json, expected", + [ + pytest.param( + FULL_BATCH_RESPONSE, + FULL_BATCH_RESPONSE["images"], + id="happy_path", + ), + pytest.param({}, None, id="empty_dict"), + pytest.param(None, None, id="None"), + ], +) +def test_get_batch_data(response_json, expected): + actual = ingester.get_batch_data(response_json) + assert actual == expected + + +@pytest.mark.parametrize( + "response_json, expected_result", + [ + ({}, False), + (FULL_BATCH_RESPONSE, True), + (SINGLE_ITEM, False), + ], +) +def test_get_should_continue(response_json, expected_result): + actual_result = ingester.get_should_continue(response_json) + assert actual_result == expected_result + + +# def 
get_record_data(self, data: dict) -> dict | list[dict] | None: +@pytest.mark.parametrize( + "response_json, expected_data", + [ + pytest.param({}, None, id="empty_dict"), + pytest.param(FULL_BATCH_RESPONSE, None, id="no_urls"), + pytest.param( + SINGLE_ITEM, + { + "foreign_landing_url": "https://nappy.co/photo/9/woman-with-tattoos", + "image_url": "https://images.nappy.co/uploads/large/101591721349meykm7s6hvaswwvslpjrwibeyzru1fcxtxh0hf09cs7kdhmtptef4y3k4ua5z1bkyrbxov8tmagnafm8upwa3hxaxururtx7azaf.jpg", + "license_info": get_license_info( + "https://creativecommons.org/publicdomain/zero/1.0/" + ), + "foreign_identifier": 9, + "filesize": 233500, + "filetype": "jpg", + "creator": "iamconnorrm", + "creator_url": "https://nappy.co/iamconnorrm", + "title": "woman with tattoos", + "thumbnail_url": "https://images.nappy.co/uploads/large/101591721349meykm7s6hvaswwvslpjrwibeyzru1fcxtxh0hf09cs7kdhmtptef4y3k4ua5z1bkyrbxov8tmagnafm8upwa3hxaxururtx7azaf.jpg?auto=format&w=600&q=75", + "meta_data": { + "views": 82692, + "saves": 18, + "downloads": 1329, + }, + "raw_tags": [ + "indoor", + "bed", + "arthropod", + "dark", + "lobster", + "braids", + "female", + "red", + "blue", + "tattoo", + "earring", + "phone", + "laying", + "room", + ], + "width": 2048, + "height": 1361, + }, + id="happy_path", + ), + ], +) +def test_get_record_data(response_json, expected_data): + actual_data = ingester.get_record_data(response_json) + assert actual_data == expected_data + + +@pytest.mark.parametrize( + "raw_filesize_string, expected_result", + [ + pytest.param("4kB", 4_000, id="happy_kB"), + pytest.param("4MB", 4_000_000, id="happy_MB"), + pytest.param("4GB", 4_000_000_000, id="happy_GB"), + pytest.param("", None, id="empty_string"), + pytest.param([], None, id="not_a_string"), + pytest.param("gibberish", None, id="gibberish"), + pytest.param("10.3kB", 10_300, id="decimal"), + pytest.param("10.12345kB", 10_123, id="rounding"), + pytest.param(" 4 kB ", 4_000, id="extra_spaces"), + ], +) +def 
test_convert_filesize(raw_filesize_string, expected_result): + # this is a static method, so not using the instance for testing + actual_result = NappyDataIngester._convert_filesize(raw_filesize_string) + assert actual_result == expected_result