Skip to content

Commit

Permalink
Add justtakeitfree as an image provider
Browse files Browse the repository at this point in the history
  • Loading branch information
obulat committed Aug 7, 2023
1 parent 61c7abd commit 68a860f
Show file tree
Hide file tree
Showing 6 changed files with 171 additions and 0 deletions.
1 change: 1 addition & 0 deletions catalog/dags/common/loader/provider_details.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
FREESOUND_DEFAULT_PROVIDER = "freesound"
INATURALIST_DEFAULT_PROVIDER = "inaturalist"
JAMENDO_DEFAULT_PROVIDER = "jamendo"
JUSTTAKEITFREE_IMAGE_PROVIDER = "justtakeitfree"
METROPOLITAN_MUSEUM_DEFAULT_PROVIDER = "met"
NAPPY_DEFAULT_PROVIDER = "nappy"
NYPL_DEFAULT_PROVIDER = "nypl"
Expand Down
92 changes: 92 additions & 0 deletions catalog/dags/providers/provider_api_scripts/justtakeitfree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""
Content Provider: Justtakeitfree
ETL Process: Use the API to identify all CC licensed media.
Output: TSV file containing the media and the
respective meta-data.
Notes: https://justtakeitfree.com/api/api.php
"""
import logging

from airflow.models import Variable

from common.constants import IMAGE
from common.licenses import get_license_info
from common.loader import provider_details as prov
from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester


logger = logging.getLogger(__name__)


class JusttakeitfreeDataIngester(ProviderDataIngester):
    """Ingest image records from the Justtakeitfree API.

    Requests are paginated with a ``page`` query parameter and authenticated
    with the ``API_KEY_JUSTTAKEITFREE`` Airflow Variable, sent as ``key``.
    """

    providers = {
        "image": prov.JUSTTAKEITFREE_IMAGE_PROVIDER,
    }
    endpoint = "https://justtakeitfree.com/api/api.php"

    def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict:
        """Return the query parameters for the next request.

        :param prev_query_params: parameters of the previous request, or
            ``None`` (or an empty dict) before the first request.
        :return: the initial parameters (page 0 plus the API key) for the
            first request; otherwise the previous parameters with ``page``
            incremented by one.
        """
        # On the first request, `prev_query_params` will be `None`. We can detect
        # this and return our default params.
        if not prev_query_params:
            return {"page": 0, "key": Variable.get("API_KEY_JUSTTAKEITFREE")}
        return {
            **prev_query_params,
            "page": prev_query_params["page"] + 1,
        }

    def get_batch_data(self, response_json):
        """Extract the batch of records from an API response.

        :param response_json: the decoded JSON response, or a falsy value.
        :return: the first element of the response's ``data`` value, or
            ``None`` when the response is empty or ``data`` is missing/empty.
        """
        if not response_json:
            return None
        # NOTE(review): the records are taken from `data[0]`, so the API
        # presumably wraps them in an outer list — confirm against the API docs.
        # A missing or empty `data` must not raise; it simply means no batch.
        batch_wrapper = response_json.get("data")
        if not batch_wrapper:
            return None
        return batch_wrapper[0]

    def get_media_type(self, record: dict):
        """Return the media type of ``record``; this provider only has images."""
        return IMAGE

    def get_record_data(self, data: dict) -> dict | list[dict] | None:
        """Convert one API record into the fields expected for an image.

        :param data: a single record from the batch.
        :return: a dict of record fields, or ``None`` if the record lacks a
            landing URL, a derivable foreign identifier, an image URL, or
            license information.
        """
        # Validate the landing URL first: the identifier is derived from it,
        # so a missing/null `page_link` must be skipped rather than crash.
        if not (foreign_landing_url := data.get("page_link")):
            logger.debug("Skipping record with missing foreign landing url")
            return None

        # The identifier is the second-to-last `/`-separated segment of the
        # landing URL, e.g. "2" in "https://justtakeitfree.com/photo/2/".
        url_segments = foreign_landing_url.split("/")
        foreign_identifier = url_segments[-2] if len(url_segments) >= 2 else ""
        if not foreign_identifier:
            logger.debug("Skipping record with missing foreign_identifier")
            return None

        if not (url := data.get("full_image_link")):
            logger.debug("Skipping record with missing url")
            return None

        # Use the `get_license_info` utility to get license information from a URL.
        license_url = data.get("license_link")
        license_info = get_license_info(license_url)
        if license_info is None:
            logger.debug("Skipping record with missing license info")
            return None

        # Every record is credited to the site itself.
        creator = "Justtakeitfree Free Photos"
        creator_url = "https://justtakeitfree.com"
        raw_tags = data.get("tags")

        return {
            "foreign_landing_url": foreign_landing_url,
            "url": url,
            "license_info": license_info,
            "foreign_identifier": foreign_identifier,
            # Optional fields
            "creator": creator,
            "creator_url": creator_url,
            "raw_tags": raw_tags,
        }


def main():
    """Run a Justtakeitfree ingestion directly.

    Lets the ingester be run from the CLI, without Airflow running, for
    debugging purposes.
    """
    JusttakeitfreeDataIngester().ingest_records()


if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions catalog/dags/providers/provider_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from providers.provider_api_scripts.freesound import FreesoundDataIngester
from providers.provider_api_scripts.inaturalist import INaturalistDataIngester
from providers.provider_api_scripts.jamendo import JamendoDataIngester
from providers.provider_api_scripts.justtakeitfree import JusttakeitfreeDataIngester
from providers.provider_api_scripts.metropolitan_museum import MetMuseumDataIngester
from providers.provider_api_scripts.museum_victoria import VictoriaDataIngester
from providers.provider_api_scripts.nappy import NappyDataIngester
Expand Down Expand Up @@ -235,6 +236,9 @@ def __post_init__(self):
ProviderWorkflow(
ingester_class=JamendoDataIngester,
),
ProviderWorkflow(
ingester_class=JusttakeitfreeDataIngester,
),
ProviderWorkflow(
ingester_class=MetMuseumDataIngester,
start_date=datetime(2016, 9, 1),
Expand Down
1 change: 1 addition & 0 deletions catalog/env.template
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ AIRFLOW_VAR_API_KEY_EUROPEANA=not_set
AIRFLOW_VAR_API_KEY_FLICKR=not_set
AIRFLOW_VAR_API_KEY_FREESOUND=not_set
AIRFLOW_VAR_API_KEY_JAMENDO=not_set
AIRFLOW_VAR_API_KEY_JUSTTAKEITFREE=not_set
AIRFLOW_VAR_API_KEY_NYPL=not_set
AIRFLOW_VAR_API_KEY_RAWPIXEL=not_set
AIRFLOW_VAR_API_KEY_THINGIVERSE=not_set
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"credit_requirements": "This image is published under <a href=\"https://creativecommons.org/licenses/by/4.0/deed.en\" target=\"_blank\">CC BY 4.0 licence</a>.<br />Free for commercial use. Attribution required.<br />Please, credit: &copy; Justtakeitfree Free Photos (CC BY 4.0)<br />An active hyperlink to the page should be provided.",
"credit_text": "&copy; Justtakeitfree Free Photos (CC BY 4.0)",
"full_image_link": "https://justtakeitfree.com/photos/2.jpg",
"license": "(CC BY 4.0)",
"license_link": "https://creativecommons.org/licenses/by/4.0/deed.en",
"page_link": "https://justtakeitfree.com/photo/2/",
"preview_link": "https://justtakeitfree.com/photos/2_800.jpg",
"tags": ["Baturyn fortress", "Baturyn citadel", "cossack fortress"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""
Tests for the Justtakeitfree provider API script.

TODO: Add tests for any additional methods in the subclass, covering edge cases
(missing keys, unexpected data types, `None` values, etc.), and make the test
names more specific where needed.
Run the tests locally with `just test -k justtakeitfree`.
"""

import json
from pathlib import Path

from common.licenses import get_license_info
from providers.provider_api_scripts.justtakeitfree import JusttakeitfreeDataIngester


# Directory for API responses used as test fixtures (e.g. `single_item.json`);
# additional sample responses can be added here.
RESOURCES = Path(__file__).parent / "resources/justtakeitfree"

# Module-level ingester instance shared by the tests in this file.
ingester = JusttakeitfreeDataIngester()


def test_get_next_query_params_default_response():
    """With no previous params, the first request asks for page 0."""
    first_params = ingester.get_next_query_params(None)
    # The API key comes from configuration, so it is excluded from the check.
    first_params.pop("key", None)
    assert first_params == {"page": 0}


def test_get_next_query_params_updates_parameters():
    """Given previous params, the next request advances `page` by one."""
    next_params = ingester.get_next_query_params({"page": 1})
    # Ignore the API key, if any, so only pagination is compared.
    next_params.pop("key", None)

    assert next_params == {"page": 2}


def test_get_record_data():
    """A complete sample record is converted into the expected field dict."""
    with (RESOURCES / "single_item.json").open() as fixture:
        sample_record = json.load(fixture)

    parsed_record = ingester.get_record_data(sample_record)

    expected_license = get_license_info(
        "https://creativecommons.org/licenses/by/4.0/deed.en"
    )
    expected_record = {
        "foreign_landing_url": "https://justtakeitfree.com/photo/2/",
        "url": "https://justtakeitfree.com/photos/2.jpg",
        "foreign_identifier": "2",
        "creator": "Justtakeitfree Free Photos",
        "creator_url": "https://justtakeitfree.com",
        "license_info": expected_license,
        "raw_tags": ["Baturyn fortress", "Baturyn citadel", "cossack fortress"],
    }

    assert parsed_record == expected_record

0 comments on commit 68a860f

Please sign in to comment.